From b048d8462652159c5314d19b191220b0ec384edb Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@u.ibm.com>
Date: Tue, 5 Feb 2008 08:52:45 -0500
Subject: [PATCH 01/13] jbd2: Add error check to journal_wait_on_commit_record
 to avoid oops

The buffer head pointer passed to journal_wait_on_commit_record() could
be NULL if the previous journal_submit_commit_record() failed or the
journal has already been aborted.

Looking at the jbd2 debug messages, before the oops happened the
journal was aborted because it tried to access the next log block beyond
the end of the device. This might be caused by using a corrupted image.

We need to check the error returns from journal_submit_commit_record()
and avoid calling journal_wait_on_commit_record() in the failure case.

This addresses Kernel Bugzilla #9849

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4f302d279279..48b3cb8aeb2e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -872,7 +872,8 @@ wait_for_iobuf:
 		if (err)
 			__jbd2_journal_abort_hard(journal);
 	}
-	err = journal_wait_on_commit_record(cbh);
+	if (!err && !is_journal_aborted(journal))
+		err = journal_wait_on_commit_record(cbh);
 
 	if (err)
 		jbd2_journal_abort(journal, err);

From 5315217efea54a07950758005686adedb8e8e680 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 1 Feb 2008 08:26:46 -0500
Subject: [PATCH 02/13] [PATCH] jbd: Remove useless loop when writing commit
 record

Commit block was intended to have several copies of the header. But
due to a bug it never had them and actually, nobody checks that. So
just remove the useless loop.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd/commit.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 8e08efcaede2..a38c7186c570 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -104,7 +104,8 @@ static int journal_write_commit_record(journal_t *journal,
 {
 	struct journal_head *descriptor;
 	struct buffer_head *bh;
-	int i, ret;
+	journal_header_t *header;
+	int ret;
 	int barrier_done = 0;
 
 	if (is_journal_aborted(journal))
@@ -116,13 +117,10 @@ static int journal_write_commit_record(journal_t *journal,
 
 	bh = jh2bh(descriptor);
 
-	/* AKPM: buglet - add `i' to tmp! */
-	for (i = 0; i < bh->b_size; i += 512) {
-		journal_header_t *tmp = (journal_header_t*)bh->b_data;
-		tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
-		tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
-		tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-	}
+	header = (journal_header_t *)(bh->b_data);
+	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 
 	JBUFFER_TRACE(descriptor, "write commit block");
 	set_buffer_dirty(bh);

From c4b8e635f525441b9cb0bab428b527858d977e8f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 5 Feb 2008 10:55:26 -0500
Subject: [PATCH 03/13] jbd2: Fix reference counting on the journal commit
 block's buffer head

With journal checksum patch we added asynchronous commits of journal
commit headers, and accidentally dropped taking a reference on the
buffer head.

(Before the change, sync_dirty_buffer did the get_bh(). The matching
put_bh is done by journal_wait_on_commit_record().)

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 48b3cb8aeb2e..d6ea623b1e23 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -136,7 +136,7 @@ static int journal_submit_commit_record(journal_t *journal,
 
 	JBUFFER_TRACE(descriptor, "submit commit block");
 	lock_buffer(bh);
-
+	get_bh(bh);
 	set_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
 	bh->b_end_io = journal_end_buffer_io_sync;

From 4d605179723a3fb8ba594d9516897426e6629a5b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 5 Feb 2008 10:56:15 -0500
Subject: [PATCH 04/13] JBD2: Use the incompat macro for testing the incompat
 feature.

JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT needs to be checked with
JBD2_HAS_INCOMPAT_FEATURE

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c   | 2 +-
 fs/jbd2/recovery.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d6ea623b1e23..c35bf16f44f4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -142,7 +142,7 @@ static int journal_submit_commit_record(journal_t *journal,
 	bh->b_end_io = journal_end_buffer_io_sync;
 
 	if (journal->j_flags & JBD2_BARRIER &&
-		!JBD2_HAS_COMPAT_FEATURE(journal,
+		!JBD2_HAS_INCOMPAT_FEATURE(journal,
 					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		set_buffer_ordered(bh);
 		barrier_done = 1;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d36356f7d222..146411387ada 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -641,7 +641,7 @@ static int do_one_pass(journal_t *journal,
 				if (chksum_err) {
 					info->end_transaction = next_commit_ID;
 
-					if (!JBD2_HAS_COMPAT_FEATURE(journal,
+					if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 					   JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
 						printk(KERN_ERR
 						       "JBD: Transaction %u "

From b8356c465b42c162f34b5fd4102a6c27cec36f43 Mon Sep 17 00:00:00 2001
From: Valerie Clement <valerie.clement@bull.net>
Date: Tue, 5 Feb 2008 10:56:37 -0500
Subject: [PATCH 05/13] ext4: Don't set EXTENTS_FL flag for fast symlinks

For fast symbolic links, the file content is stored in the i_block[]
array, which is not compatible with the new file extents format.
e2fsck reports error on such files because EXTENTS_FL is set.
Don't set the EXTENTS_FL flag when creating fast symlinks.

In the case of file migration, skip fast symbolic links.

Signed-off-by: Valerie Clement <valerie.clement@bull.net>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/migrate.c | 6 ++++++
 fs/ext4/namei.c   | 1 +
 2 files changed, 7 insertions(+)

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 3ebc2332f52e..9ee1f7cfb2c5 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -414,6 +414,12 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
 	if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
 		return -EINVAL;
 
+	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
+		/*
+		 * don't migrate fast symlink
+		 */
+		return retval;
+
 	down_write(&EXT4_I(inode)->i_data_sem);
 	handle = ext4_journal_start(inode,
 					EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d153bb5922fc..a9347fb43bcc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2223,6 +2223,7 @@ retry:
 		inode->i_op = &ext4_fast_symlink_inode_operations;
 		memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
 		inode->i_size = l-1;
+		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
 	}
 	EXT4_I(inode)->i_disksize = inode->i_size;
 	err = ext4_add_nondir(handle, dentry, inode);

From 42a10add852e6291a7544afd8a286622a3e6ae76 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sun, 10 Feb 2008 01:07:28 -0500
Subject: [PATCH 06/13] ext4: Fix null bh pointer dereference in mballoc

Reported by Adrian Bunk <bunk@kernel.org>:

The Coverity checker spotted the following NULL dereference:

static int ext4_mb_mark_diskspace_used
{
	...
	if (!bitmap_bh)
		goto out_err;
	...
out_err:
	sb->s_dirt = 1;
	put_bh(bitmap_bh);
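	...

Fix this by calling brelse(), which ignores a NULL buffer head, instead
of put_bh() on the error path.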

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 76e5fedc0a0b..06d1f5292d3a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3069,7 +3069,7 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 
 out_err:
 	sb->s_dirt = 1;
-	put_bh(bitmap_bh);
+	brelse(bitmap_bh);
 	return err;
 }
 

From 0040d9875dcccfcb2131417b10fbd9841bc5f05b Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 5 Feb 2008 22:36:43 -0500
Subject: [PATCH 07/13] allow in-inode EAs on ext4 root inode

The ext3 root inode was treated specially with respect
to in-inode extended attributes, for reasons detailed
in the removed comment below.  The first mkfs-created
inodes would not get extra_i_size or the EXT3_STATE_XATTR
flag set in ext3_read_inode, which disallowed reading or
setting in-inode EAs on the root.

However, in ext4, ext4_mark_inode_dirty calls
ext4_expand_extra_isize for all inodes; once this is done
EAs may be placed in the root ext4 inode body.

But for reasons above, it won't be found after a reboot.

testcase:

setfattr -n user.name -v value mntpt/
setfattr -n user.name2 -v value2 mntpt/
umount mntpt/; remount mntpt/
getfattr -d mntpt/

name2/value2 has gone missing; debugfs shows it in the
inode body, but it is not found there by getfattr.

The following fixes it up; newer mkfs appears to properly
zero the inodes, so this workaround isn't needed for ext4.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f4e387452246..bbfabf876e78 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2758,13 +2758,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
 
-	if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 &&
-	    EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
-		/*
-		 * When mke2fs creates big inodes it does not zero out
-		 * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE,
-		 * so ignore those first few inodes.
-		 */
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 		    EXT4_INODE_SIZE(inode->i_sb)) {

From 8009f9fb3067fef6c2ca0c16f6bac786ae28639d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sun, 10 Feb 2008 01:20:05 -0500
Subject: [PATCH 08/13] ext4: Fix circular locking dependency with migrate and
 rm.

In order to prevent a circular locking dependency when an unlink
operation is racing with an ext4 migration, we delay taking i_data_sem
until just before switching the inode format, and use i_mutex to prevent
writes and truncates during the first part of the migration operation.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/migrate.c | 117 +++++++++++++++++++++++++++++-----------------
 1 file changed, 74 insertions(+), 43 deletions(-)

diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 9ee1f7cfb2c5..8c6c685b9d22 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -61,10 +61,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
 		retval = ext4_journal_restart(handle, needed);
 		if (retval)
 			goto err_out;
-	}
-	if (needed) {
+	} else if (needed) {
 		retval = ext4_journal_extend(handle, needed);
-		if (retval != 0) {
+		if (retval) {
 			/*
 			 * IF not able to extend the journal restart the journal
 			 */
@@ -220,6 +219,26 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
 
 }
 
+static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
+{
+	int retval = 0, needed;
+
+	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+		return 0;
+	/*
+	 * We are freeing a blocks. During this we touch
+	 * superblock, group descriptor and block bitmap.
+	 * So allocate a credit of 3. We may update
+	 * quota (user and group).
+	 */
+	needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+
+	if (ext4_journal_extend(handle, needed) != 0)
+		retval = ext4_journal_restart(handle, needed);
+
+	return retval;
+}
+
 static int free_dind_blocks(handle_t *handle,
 				struct inode *inode, __le32 i_data)
 {
@@ -234,11 +253,14 @@ static int free_dind_blocks(handle_t *handle,
 
 	tmp_idata = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
-		if (tmp_idata[i])
+		if (tmp_idata[i]) {
+			extend_credit_for_blkdel(handle, inode);
 			ext4_free_blocks(handle, inode,
 					le32_to_cpu(tmp_idata[i]), 1, 1);
+		}
 	}
 	put_bh(bh);
+	extend_credit_for_blkdel(handle, inode);
 	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
 	return 0;
 }
@@ -267,29 +289,32 @@ static int free_tind_blocks(handle_t *handle,
 		}
 	}
 	put_bh(bh);
+	extend_credit_for_blkdel(handle, inode);
 	ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
 	return 0;
 }
 
-static int free_ind_block(handle_t *handle, struct inode *inode)
+static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
 {
 	int retval;
-	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (ei->i_data[EXT4_IND_BLOCK])
+	/* ei->i_data[EXT4_IND_BLOCK] */
+	if (i_data[0]) {
+		extend_credit_for_blkdel(handle, inode);
 		ext4_free_blocks(handle, inode,
-				le32_to_cpu(ei->i_data[EXT4_IND_BLOCK]), 1, 1);
+				le32_to_cpu(i_data[0]), 1, 1);
+	}
 
-	if (ei->i_data[EXT4_DIND_BLOCK]) {
-		retval = free_dind_blocks(handle, inode,
-						ei->i_data[EXT4_DIND_BLOCK]);
+	/* ei->i_data[EXT4_DIND_BLOCK] */
+	if (i_data[1]) {
+		retval = free_dind_blocks(handle, inode, i_data[1]);
 		if (retval)
 			return retval;
 	}
 
-	if (ei->i_data[EXT4_TIND_BLOCK]) {
-		retval = free_tind_blocks(handle, inode,
-						ei->i_data[EXT4_TIND_BLOCK]);
+	/* ei->i_data[EXT4_TIND_BLOCK] */
+	if (i_data[2]) {
+		retval = free_tind_blocks(handle, inode, i_data[2]);
 		if (retval)
 			return retval;
 	}
@@ -297,15 +322,13 @@ static int free_ind_block(handle_t *handle, struct inode *inode)
 }
 
 static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
-				struct inode *tmp_inode, int retval)
+				struct inode *tmp_inode)
 {
+	int retval;
+	__le32	i_data[3];
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode);
 
-	retval = free_ind_block(handle, inode);
-	if (retval)
-		goto err_out;
-
 	/*
 	 * One credit accounted for writing the
 	 * i_data field of the original inode
@@ -317,6 +340,11 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 			goto err_out;
 	}
 
+	i_data[0] = ei->i_data[EXT4_IND_BLOCK];
+	i_data[1] = ei->i_data[EXT4_DIND_BLOCK];
+	i_data[2] = ei->i_data[EXT4_TIND_BLOCK];
+
+	down_write(&EXT4_I(inode)->i_data_sem);
 	/*
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
@@ -336,8 +364,15 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 	spin_lock(&inode->i_lock);
 	inode->i_blocks += tmp_inode->i_blocks;
 	spin_unlock(&inode->i_lock);
+	up_write(&EXT4_I(inode)->i_data_sem);
 
+	/*
+	 * We mark the inode dirty after, because we decrement the
+	 * i_blocks when freeing the indirect meta-data blocks
+	 */
+	retval = free_ind_block(handle, inode, i_data);
 	ext4_mark_inode_dirty(handle, inode);
+
 err_out:
 	return retval;
 }
@@ -365,6 +400,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
 		}
 	}
 	put_bh(bh);
+	extend_credit_for_blkdel(handle, inode);
 	ext4_free_blocks(handle, inode, block, 1, 1);
 	return retval;
 }
@@ -420,7 +456,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
 		 */
 		return retval;
 
-	down_write(&EXT4_I(inode)->i_data_sem);
 	handle = ext4_journal_start(inode,
 					EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
@@ -454,13 +489,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
 	ext4_orphan_add(handle, tmp_inode);
 	ext4_journal_stop(handle);
 
-	ei = EXT4_I(inode);
-	i_data = ei->i_data;
-	memset(&lb, 0, sizeof(lb));
-
-	/* 32 bit block address 4 bytes */
-	max_entries = inode->i_sb->s_blocksize >> 2;
-
 	/*
 	 * start with one credit accounted for
 	 * superblock modification.
@@ -469,7 +497,20 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
 	 * trascation that created the inode. Later as and
 	 * when we add extents we extent the journal
 	 */
+	/*
+	 * inode_mutex prevent write and truncate on the file. Read still goes
+	 * through. We take i_data_sem in ext4_ext_swap_inode_data before we
+	 * switch the inode format to prevent read.
+	 */
+	mutex_lock(&(inode->i_mutex));
 	handle = ext4_journal_start(inode, 1);
+
+	ei = EXT4_I(inode);
+	i_data = ei->i_data;
+	memset(&lb, 0, sizeof(lb));
+
+	/* 32 bit block address 4 bytes */
+	max_entries = inode->i_sb->s_blocksize >> 2;
 	for (i = 0; i < EXT4_NDIR_BLOCKS; i++, blk_count++) {
 		if (i_data[i]) {
 			retval = update_extent_range(handle, tmp_inode,
@@ -507,19 +548,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
 	 */
 	retval = finish_range(handle, tmp_inode, &lb);
 err_out:
-	/*
-	 * We are either freeing extent information or indirect
-	 * blocks. During this we touch superblock, group descriptor
-	 * and block bitmap. Later we mark the tmp_inode dirty
-	 * via ext4_ext_tree_init. So allocate a credit of 4
-	 * We may update quota (user and group).
-	 *
-	 * FIXME!! we may be touching bitmaps in different block groups.
-	 */
-	if (ext4_journal_extend(handle,
-			4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)) != 0)
-		ext4_journal_restart(handle,
-				4 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
 	if (retval)
 		/*
 		 * Failure case delete the extent information with the
@@ -528,7 +556,11 @@ err_out:
 		free_ext_block(handle, tmp_inode);
 	else
 		retval = ext4_ext_swap_inode_data(handle, inode,
-							tmp_inode, retval);
+							tmp_inode);
+
+	/* We mark the tmp_inode dirty via ext4_ext_tree_init. */
+	if (ext4_journal_extend(handle, 1) != 0)
+		ext4_journal_restart(handle, 1);
 
 	/*
 	 * Mark the tmp_inode as of size zero
@@ -556,8 +588,7 @@ err_out:
 	tmp_inode->i_nlink = 0;
 
 	ext4_journal_stop(handle);
-
-	up_write(&EXT4_I(inode)->i_data_sem);
+	mutex_unlock(&(inode->i_mutex));
 
 	if (tmp_inode)
 		iput(tmp_inode);

From 7fb5409df092589b86cc9412d926879cb572b7f0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 10 Feb 2008 01:08:38 -0500
Subject: [PATCH 09/13] ext4: Fix Direct I/O locking

We cannot start transaction in ext4_direct_IO() and just let it last
during the whole write because dio_get_page() acquires mmap_sem which
ranks above transaction start (e.g. because we have dependency chain
mmap_sem->PageLock->journal_start, or because we update atime while
holding mmap_sem) and thus deadlocks could happen. We solve the problem
by starting a transaction separately for each ext4_get_block() call.

We *could* have a problem that we allocate a block and before its data
are written out the machine crashes and thus we expose stale data. But
that does not happen because for hole-filling generic code falls back to
buffered writes and for file extension, we add inode to orphan list and
thus in case of crash, journal replay will truncate inode back to the
original size.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 107 ++++++++++++++++++++++++------------------------
 1 file changed, 53 insertions(+), 54 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bbfabf876e78..7dd9b50d5ebc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -892,7 +892,16 @@ out:
 	return err;
 }
 
-#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+/*
+ * Number of credits we need for writing DIO_MAX_BLOCKS:
+ * We need sb + group descriptor + bitmap + inode -> 4
+ * For B blocks with A block pointers per block we need:
+ * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
+ * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ */
+#define DIO_CREDITS 25
 
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 			unsigned long max_blocks, struct buffer_head *bh,
@@ -939,49 +948,31 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
 	handle_t *handle = ext4_journal_current_handle();
-	int ret = 0;
+	int ret = 0, started = 0;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 
-	if (!create)
-		goto get_block;		/* A read */
-
-	if (max_blocks == 1)
-		goto get_block;		/* A single block get */
-
-	if (handle->h_transaction->t_state == T_LOCKED) {
-		/*
-		 * Huge direct-io writes can hold off commits for long
-		 * periods of time.  Let this commit run.
-		 */
-		ext4_journal_stop(handle);
-		handle = ext4_journal_start(inode, DIO_CREDITS);
-		if (IS_ERR(handle))
+	if (create && !handle) {
+		/* Direct IO write... */
+		if (max_blocks > DIO_MAX_BLOCKS)
+			max_blocks = DIO_MAX_BLOCKS;
+		handle = ext4_journal_start(inode, DIO_CREDITS +
+			      2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
-		goto get_block;
-	}
-
-	if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
-		/*
-		 * Getting low on buffer credits...
-		 */
-		ret = ext4_journal_extend(handle, DIO_CREDITS);
-		if (ret > 0) {
-			/*
-			 * Couldn't extend the transaction.  Start a new one.
-			 */
-			ret = ext4_journal_restart(handle, DIO_CREDITS);
+			goto out;
 		}
+		started = 1;
 	}
 
-get_block:
-	if (ret == 0) {
-		ret = ext4_get_blocks_wrap(handle, inode, iblock,
+	ret = ext4_get_blocks_wrap(handle, inode, iblock,
 					max_blocks, bh_result, create, 0);
-		if (ret > 0) {
-			bh_result->b_size = (ret << inode->i_blkbits);
-			ret = 0;
-		}
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
 	}
+	if (started)
+		ext4_journal_stop(handle);
+out:
 	return ret;
 }
 
@@ -1671,7 +1662,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
  * if the machine crashes during the write.
  *
  * If the O_DIRECT write is intantiating holes inside i_size and the machine
- * crashes then stale disk data _may_ be exposed inside the file.
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
  */
 static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
@@ -1680,7 +1672,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	handle_t *handle = NULL;
+	handle_t *handle;
 	ssize_t ret;
 	int orphan = 0;
 	size_t count = iov_length(iov, nr_segs);
@@ -1688,17 +1680,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	if (rw == WRITE) {
 		loff_t final_size = offset + count;
 
-		handle = ext4_journal_start(inode, DIO_CREDITS);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
 		if (final_size > inode->i_size) {
+			/* Credits for sb + inode write */
+			handle = ext4_journal_start(inode, 2);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				goto out;
+			}
 			ret = ext4_orphan_add(handle, inode);
-			if (ret)
-				goto out_stop;
+			if (ret) {
+				ext4_journal_stop(handle);
+				goto out;
+			}
 			orphan = 1;
 			ei->i_disksize = inode->i_size;
+			ext4_journal_stop(handle);
 		}
 	}
 
@@ -1706,18 +1702,21 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
 
-	/*
-	 * Reacquire the handle: ext4_get_block() can restart the transaction
-	 */
-	handle = ext4_journal_current_handle();
-
-out_stop:
-	if (handle) {
+	if (orphan) {
 		int err;
 
-		if (orphan && inode->i_nlink)
+		/* Credits for sb + inode write */
+		handle = ext4_journal_start(inode, 2);
+		if (IS_ERR(handle)) {
+			/* This is really bad luck. We've written the data
+			 * but cannot extend i_size. Bail out and pretend
+			 * the write failed... */
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		if (inode->i_nlink)
 			ext4_orphan_del(handle, inode);
-		if (orphan && ret > 0) {
+		if (ret > 0) {
 			loff_t end = offset + ret;
 			if (end > inode->i_size) {
 				ei->i_disksize = end;

From c4e35e07af162ea4d642b1c6ffacbb63c3ed1804 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Sun, 10 Feb 2008 01:09:32 -0500
Subject: [PATCH 10/13] JBD2: Clear buffer_ordered flag for barrier IO request
 on success

In JBD2 journal_submit_commit_record(), clear the buffer_ordered flag
for the bh after the barrier IO has succeeded. This ensures that if the
same buffer head is later submitted to an underlying device that has
been reconfigured to no longer support barrier requests, the JBD2
commit code treats it as a normal IO (without a barrier).

This is a port from JBD/ext3 fix from Neil Brown.

More details from Neil:

Some devices - notably dm and md - can change their behaviour in
response to BIO_RW_BARRIER requests.  They might start out accepting
such requests but on reconfiguration, they find out that they cannot
any more. JBD2 deals with this by always testing if BIO_RW_BARRIER
requests fail with EOPNOTSUPP, and retrying the write
requests without the barrier (probably after waiting for any pending
writes to complete).

However, there is a bug in the handling of this in JBD2 for ext4.

When ext4/JBD2 submits a BIO_RW_BARRIER request,
it sets the buffer_ordered flag on the buffer head.
If the request completes successfully, the flag STAYS SET.

Other code might then write the same buffer_head after the device has
been reconfigured to not accept barriers.  This write will then fail,
but the "other code" is not ready to handle EOPNOTSUPP errors and the
error will be treated as fatal.

Cc:  Neil Brown <neilb@suse.de>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index c35bf16f44f4..a8173081f831 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -148,6 +148,8 @@ static int journal_submit_commit_record(journal_t *journal,
 		barrier_done = 1;
 	}
 	ret = submit_bh(WRITE, bh);
+	if (barrier_done)
+		clear_buffer_ordered(bh);
 
 	/* is it possible for another commit to fail at roughly
 	 * the same time as this one?  If so, we don't want to
@@ -166,7 +168,6 @@ static int journal_submit_commit_record(journal_t *journal,
 		spin_unlock(&journal->j_state_lock);
 
 		/* And try again, without the barrier */
-		clear_buffer_ordered(bh);
 		set_buffer_uptodate(bh);
 		set_buffer_dirty(bh);
 		ret = submit_bh(WRITE, bh);

From 256bdb497c6f562462f1e89fc8e1409f61ef40cb Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 10 Feb 2008 01:13:33 -0500
Subject: [PATCH 11/13] ext4: allocate struct ext4_allocation_context from a
 kmem cache

struct ext4_allocation_context is rather large, and this bloats
the stack of many functions which use it.  Allocating it from
a named slab cache will alleviate this.

For example, with this change (on top of the noinline patch sent earlier):

-ext4_mb_new_blocks		200
+ext4_mb_new_blocks		 40

-ext4_mb_free_blocks		344
+ext4_mb_free_blocks		168

-ext4_mb_release_inode_pa	216
+ext4_mb_release_inode_pa	 40

-ext4_mb_release_group_pa	192
+ext4_mb_release_group_pa	 24

Most of these stack-allocated structs are actually used only for
mballoc history; and in those cases often a smaller struct would do.
So changing that may be another way around it, at least for those
functions, if preferred.  For now, in those cases where the ac
is only for history, an allocation failure simply skips the history
recording, and does not cause any other failures.


Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 127 ++++++++++++++++++++++++++++++----------------
 1 file changed, 82 insertions(+), 45 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 06d1f5292d3a..5e3c35191412 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -420,6 +420,7 @@
 #define MB_DEFAULT_GROUP_PREALLOC	512
 
 static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
 
 #ifdef EXT4_BB_MAX_BLOCKS
 #undef EXT4_BB_MAX_BLOCKS
@@ -2959,12 +2960,19 @@ int __init init_ext4_mballoc(void)
 	if (ext4_pspace_cachep == NULL)
 		return -ENOMEM;
 
+	ext4_ac_cachep =
+		kmem_cache_create("ext4_alloc_context",
+				     sizeof(struct ext4_allocation_context),
+				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	if (ext4_ac_cachep == NULL) {
+		kmem_cache_destroy(ext4_pspace_cachep);
+		return -ENOMEM;
+	}
 #ifdef CONFIG_PROC_FS
 	proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs);
 	if (proc_root_ext4 == NULL)
 		printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT);
 #endif
-
 	return 0;
 }
 
@@ -2972,6 +2980,7 @@ void exit_ext4_mballoc(void)
 {
 	/* XXX: synchronize_rcu(); */
 	kmem_cache_destroy(ext4_pspace_cachep);
+	kmem_cache_destroy(ext4_ac_cachep);
 #ifdef CONFIG_PROC_FS
 	remove_proc_entry(EXT4_ROOT, proc_root_fs);
 #endif
@@ -3699,7 +3708,7 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
 				struct buffer_head *bitmap_bh,
 				struct ext4_prealloc_space *pa)
 {
-	struct ext4_allocation_context ac;
+	struct ext4_allocation_context *ac;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	unsigned long end;
@@ -3715,9 +3724,13 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	end = bit + pa->pa_len;
 
-	ac.ac_sb = sb;
-	ac.ac_inode = pa->pa_inode;
-	ac.ac_op = EXT4_MB_HISTORY_DISCARD;
+	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+
+	if (ac) {
+		ac->ac_sb = sb;
+		ac->ac_inode = pa->pa_inode;
+		ac->ac_op = EXT4_MB_HISTORY_DISCARD;
+	}
 
 	while (bit < end) {
 		bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit);
@@ -3733,11 +3746,13 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
 				(unsigned) group);
 		free += next - bit;
 
-		ac.ac_b_ex.fe_group = group;
-		ac.ac_b_ex.fe_start = bit;
-		ac.ac_b_ex.fe_len = next - bit;
-		ac.ac_b_ex.fe_logical = 0;
-		ext4_mb_store_history(&ac);
+		if (ac) {
+			ac->ac_b_ex.fe_group = group;
+			ac->ac_b_ex.fe_start = bit;
+			ac->ac_b_ex.fe_len = next - bit;
+			ac->ac_b_ex.fe_logical = 0;
+			ext4_mb_store_history(ac);
+		}
 
 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
 		bit = next + 1;
@@ -3751,6 +3766,8 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
 	}
 	BUG_ON(free != pa->pa_free);
 	atomic_add(free, &sbi->s_mb_discarded);
+	if (ac)
+		kmem_cache_free(ext4_ac_cachep, ac);
 
 	return err;
 }
@@ -3758,12 +3775,15 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
 static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 				struct ext4_prealloc_space *pa)
 {
-	struct ext4_allocation_context ac;
+	struct ext4_allocation_context *ac;
 	struct super_block *sb = e4b->bd_sb;
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 
-	ac.ac_op = EXT4_MB_HISTORY_DISCARD;
+	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+
+	if (ac)
+		ac->ac_op = EXT4_MB_HISTORY_DISCARD;
 
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3771,13 +3791,16 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
 	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
 
-	ac.ac_sb = sb;
-	ac.ac_inode = NULL;
-	ac.ac_b_ex.fe_group = group;
-	ac.ac_b_ex.fe_start = bit;
-	ac.ac_b_ex.fe_len = pa->pa_len;
-	ac.ac_b_ex.fe_logical = 0;
-	ext4_mb_store_history(&ac);
+	if (ac) {
+		ac->ac_sb = sb;
+		ac->ac_inode = NULL;
+		ac->ac_b_ex.fe_group = group;
+		ac->ac_b_ex.fe_start = bit;
+		ac->ac_b_ex.fe_len = pa->pa_len;
+		ac->ac_b_ex.fe_logical = 0;
+		ext4_mb_store_history(ac);
+		kmem_cache_free(ext4_ac_cachep, ac);
+	}
 
 	return 0;
 }
@@ -4231,7 +4254,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 				 struct ext4_allocation_request *ar, int *errp)
 {
-	struct ext4_allocation_context ac;
+	struct ext4_allocation_context *ac = NULL;
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	ext4_fsblk_t block = 0;
@@ -4257,53 +4280,60 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	}
 	inquota = ar->len;
 
+	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+	if (!ac) {
+		*errp = -ENOMEM;
+		return 0;
+	}
+
 	ext4_mb_poll_new_transaction(sb, handle);
 
-	*errp = ext4_mb_initialize_context(&ac, ar);
+	*errp = ext4_mb_initialize_context(ac, ar);
 	if (*errp) {
 		ar->len = 0;
 		goto out;
 	}
 
-	ac.ac_op = EXT4_MB_HISTORY_PREALLOC;
-	if (!ext4_mb_use_preallocated(&ac)) {
+	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
+	if (!ext4_mb_use_preallocated(ac)) {
 
-		ac.ac_op = EXT4_MB_HISTORY_ALLOC;
-		ext4_mb_normalize_request(&ac, ar);
+		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
+		ext4_mb_normalize_request(ac, ar);
 
 repeat:
 		/* allocate space in core */
-		ext4_mb_regular_allocator(&ac);
+		ext4_mb_regular_allocator(ac);
 
 		/* as we've just preallocated more space than
 		 * user requested orinally, we store allocated
 		 * space in a special descriptor */
-		if (ac.ac_status == AC_STATUS_FOUND &&
-				ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len)
-			ext4_mb_new_preallocation(&ac);
+		if (ac->ac_status == AC_STATUS_FOUND &&
+				ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+			ext4_mb_new_preallocation(ac);
 	}
 
-	if (likely(ac.ac_status == AC_STATUS_FOUND)) {
-		ext4_mb_mark_diskspace_used(&ac, handle);
+	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
+		ext4_mb_mark_diskspace_used(ac, handle);
 		*errp = 0;
-		block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex);
-		ar->len = ac.ac_b_ex.fe_len;
+		block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+		ar->len = ac->ac_b_ex.fe_len;
 	} else {
-		freed  = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len);
+		freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
 		if (freed)
 			goto repeat;
 		*errp = -ENOSPC;
-		ac.ac_b_ex.fe_len = 0;
+		ac->ac_b_ex.fe_len = 0;
 		ar->len = 0;
-		ext4_mb_show_ac(&ac);
+		ext4_mb_show_ac(ac);
 	}
 
-	ext4_mb_release_context(&ac);
+	ext4_mb_release_context(ac);
 
 out:
 	if (ar->len < inquota)
 		DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
 
+	kmem_cache_free(ext4_ac_cachep, ac);
 	return block;
 }
 static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4407,7 +4437,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head *bitmap_bh = 0;
 	struct super_block *sb = inode->i_sb;
-	struct ext4_allocation_context ac;
+	struct ext4_allocation_context *ac = NULL;
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
 	unsigned long overflow;
@@ -4436,9 +4466,12 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 
 	ext4_debug("freeing block %lu\n", block);
 
-	ac.ac_op = EXT4_MB_HISTORY_FREE;
-	ac.ac_inode = inode;
-	ac.ac_sb = sb;
+	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+	if (ac) {
+		ac->ac_op = EXT4_MB_HISTORY_FREE;
+		ac->ac_inode = inode;
+		ac->ac_sb = sb;
+	}
 
 do_more:
 	overflow = 0;
@@ -4504,10 +4537,12 @@ do_more:
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
 	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
 
-	ac.ac_b_ex.fe_group = block_group;
-	ac.ac_b_ex.fe_start = bit;
-	ac.ac_b_ex.fe_len = count;
-	ext4_mb_store_history(&ac);
+	if (ac) {
+		ac->ac_b_ex.fe_group = block_group;
+		ac->ac_b_ex.fe_start = bit;
+		ac->ac_b_ex.fe_len = count;
+		ext4_mb_store_history(ac);
+	}
 
 	if (metadata) {
 		/* blocks being freed are metadata. these blocks shouldn't
@@ -4548,5 +4583,7 @@ do_more:
 error_return:
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
+	if (ac)
+		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }

From 26346ff681cb42c1436ed09c44dcae4809470dab Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sun, 10 Feb 2008 01:10:04 -0500
Subject: [PATCH 12/13] ext4: Don't panic in case of corrupt bitmap

The multiblock allocator calls BUG_ON in many cases where the free or
used block count obtained by looking at the bitmap differs from what
the allocator has accounted for internally. Use ext4_error in such
cases and don't panic the system.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5e3c35191412..dd0fcfcb35ce 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -681,7 +681,6 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 {
 	char *bb;
 
-	/* FIXME!! is this needed */
 	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
 	BUG_ON(max == NULL);
 
@@ -965,7 +964,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 	grp->bb_fragments = fragments;
 
 	if (free != grp->bb_free) {
-		printk(KERN_DEBUG
+		ext4_error(sb, __FUNCTION__,
 			"EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
 			group, free, grp->bb_free);
 		grp->bb_free = free;
@@ -1822,13 +1821,24 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 		i = ext4_find_next_zero_bit(bitmap,
 						EXT4_BLOCKS_PER_GROUP(sb), i);
 		if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
-			BUG_ON(free != 0);
+			/*
+			 * If the bitmap is corrupted, we won't find any
+			 * free blocks even though the group info says
+			 * we have free blocks
+			 */
+			ext4_error(sb, __FUNCTION__, "%d free blocks as per "
+					"group info. But bitmap says 0\n",
+					free);
 			break;
 		}
 
 		mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
 		BUG_ON(ex.fe_len <= 0);
-		BUG_ON(free < ex.fe_len);
+		if (free < ex.fe_len) {
+			ext4_error(sb, __FUNCTION__, "%d free blocks as per "
+					"group info. But got %d blocks\n",
+					free, ex.fe_len);
+		}
 
 		ext4_mb_measure_extent(ac, &ex, e4b);
 
@@ -3363,13 +3373,10 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 	ac->ac_pa = pa;
 
 	/* we don't correct pa_pstart or pa_plen here to avoid
-	 * possible race when tte group is being loaded concurrently
+	 * possible race when the group is being loaded concurrently
 	 * instead we correct pa later, after blocks are marked
-	 * in on-disk bitmap -- see ext4_mb_release_context() */
-	/*
-	 * FIXME!! but the other CPUs can look at this particular
-	 * pa and think that it have enought free blocks if we
-	 * don't update pa_free here right ?
+	 * in on-disk bitmap -- see ext4_mb_release_context()
+	 * Other CPUs are prevented from allocating from this pa by lg_mutex
 	 */
 	mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
 }
@@ -3758,13 +3765,13 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b,
 		bit = next + 1;
 	}
 	if (free != pa->pa_free) {
-		printk(KERN_ERR "pa %p: logic %lu, phys. %lu, len %lu\n",
+		printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
 			pa, (unsigned long) pa->pa_lstart,
 			(unsigned long) pa->pa_pstart,
 			(unsigned long) pa->pa_len);
-		printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free);
+		ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n",
+						free, pa->pa_free);
 	}
-	BUG_ON(free != pa->pa_free);
 	atomic_add(free, &sbi->s_mb_discarded);
 	if (ac)
 		kmem_cache_free(ext4_ac_cachep, ac);
@@ -4435,7 +4442,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 			unsigned long block, unsigned long count,
 			int metadata, unsigned long *freed)
 {
-	struct buffer_head *bitmap_bh = 0;
+	struct buffer_head *bitmap_bh = NULL;
 	struct super_block *sb = inode->i_sb;
 	struct ext4_allocation_context *ac = NULL;
 	struct ext4_group_desc *gdp;

From 469108ff3dcbc00313699d620c47f3ee1e7d19c6 Mon Sep 17 00:00:00 2001
From: Theodore Tso <tytso@MIT.EDU>
Date: Sun, 10 Feb 2008 01:11:44 -0500
Subject: [PATCH 13/13] ext4: Add new "development flag" to the ext4 filesystem

This flag is simply a generic "this is a crash/burn test filesystem"
marker.  If it is set, then filesystem code which is "in development"
will be allowed to mount the filesystem.  Filesystem code which is not
considered ready for prime-time will check for this flag, and if it is
not set, it will refuse to touch the filesystem.

As we start rolling ext4 out to distros like Fedora et al., this makes
it less likely that a user might accidentally start using ext4 on a
production filesystem; a bad thing, since that will essentially make it
unfsckable until e2fsprogs catches up.

Signed-off-by: Theodore Tso <tytso@MIT.EDU>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/super.c         | 11 +++++++++++
 include/linux/ext4_fs.h |  7 +++++++
 2 files changed, 18 insertions(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 93beb865c20d..0072da75221f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1919,6 +1919,17 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
 		printk(KERN_WARNING
 		       "EXT4-fs warning: feature flags set on rev 0 fs, "
 		       "running e2fsck is recommended\n");
+
+	/*
+	 * Since ext4 is still considered development code, we require
+	 * that the TEST_FILESYS flag in es->s_flags be set.
+	 */
+	if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
+		printk(KERN_WARNING "EXT4-fs: %s: not marked "
+		       "OK to use with test code.\n", sb->s_id);
+		goto failed_mount;
+	}
+
 	/*
 	 * Check feature flags regardless of the revision level, since we
 	 * previously didn't change the revision level when setting the flags,
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index c4f635a4dd25..250032548597 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -489,6 +489,13 @@ do {									       \
 #define	EXT4_ERROR_FS			0x0002	/* Errors detected */
 #define	EXT4_ORPHAN_FS			0x0004	/* Orphans being recovered */
 
+/*
+ * Misc. filesystem flags
+ */
+#define EXT2_FLAGS_SIGNED_HASH		0x0001  /* Signed dirhash in use */
+#define EXT2_FLAGS_UNSIGNED_HASH	0x0002  /* Unsigned dirhash in use */
+#define EXT2_FLAGS_TEST_FILESYS		0x0004	/* to test development code */
+
 /*
  * Mount flags
  */