ext4: add fsync batch tuning knobs

Add new mount options, min_batch_time and max_batch_time, which
control how long the jbd2 layer should wait for additional filesystem
operations to get batched with a synchronous write transaction.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
Theodore Ts'o 2009-01-03 20:27:38 -05:00
parent d7cfa4684d
commit 30773840c1
7 changed files with 91 additions and 8 deletions

View File

@ -283,6 +283,35 @@ delalloc (*) Deferring block allocation until write-out time.
nodelalloc Disable delayed allocation. Blocks are allocated
when data is copied from user to page cache.
max_batch_time=usec Maximum amount of time ext4 should wait for
additional filesystem operations to be batched
together with a synchronous write operation.
Since a synchronous write operation is going to
force a commit and then a wait for the I/O to
complete, waiting a little longer doesn't cost
much and can be a huge throughput win. So we
wait for a small amount of time to see if any
other transactions can piggyback on the
synchronous write. The
algorithm used is designed to automatically tune
for the speed of the disk, by measuring the
amount of time (on average) that it takes to
finish committing a transaction. Call this time
the "commit time". If the time that the
transaction has been running is less than the
commit time, ext4 will try sleeping for the
commit time to see if other operations will join
the transaction. The commit time is capped by
the max_batch_time, which defaults to 15000us
(15ms). This optimization can be turned off
entirely by setting max_batch_time to 0.
min_batch_time=usec This parameter sets the commit time (as
described above) to be at least min_batch_time.
It defaults to zero microseconds. Increasing
this parameter may improve the throughput of
multi-threaded, synchronous workloads on very
fast disks, at the cost of increasing latency.
Data Mode
=========
There are 3 different data modes:

View File

@ -328,6 +328,7 @@ struct ext4_mount_options {
uid_t s_resuid;
gid_t s_resgid;
unsigned long s_commit_interval;
u32 s_min_batch_time, s_max_batch_time;
#ifdef CONFIG_QUOTA
int s_jquota_fmt;
char *s_qf_names[MAXQUOTAS];
@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEFM_JMODE_ORDERED 0x0040
#define EXT4_DEFM_JMODE_WBACK 0x0060
/*
* Default journal batch times
*/
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
/*
* Structure of a directory entry
*/

View File

@ -74,6 +74,8 @@ struct ext4_sb_info {
struct journal_s *s_journal;
struct list_head s_orphan;
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
struct block_device *journal_bdev;
#ifdef CONFIG_JBD2_DEBUG
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */

View File

@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
#endif
if (!test_opt(sb, RESERVATION))
seq_puts(seq, ",noreservation");
if (sbi->s_commit_interval) {
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
seq_printf(seq, ",commit=%u",
(unsigned) (sbi->s_commit_interval / HZ));
}
if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
seq_printf(seq, ",min_batch_time=%u",
(unsigned) sbi->s_min_batch_time);
}
/*
 * Report max_batch_time only when it differs from the built-in default.
 * The value printed must be s_max_batch_time (not s_min_batch_time), so
 * that the displayed option matches the setting actually in effect.
 */
if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
seq_printf(seq, ",max_batch_time=%u",
(unsigned) sbi->s_max_batch_time);
}
/*
* We're changing the default of barrier mount option, so
* let's always display its mount state so it's clear what its
@ -874,7 +883,8 @@ enum {
Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore,
@ -913,6 +923,8 @@ static const match_table_t tokens = {
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
{Opt_min_batch_time, "min_batch_time=%u"},
{Opt_max_batch_time, "max_batch_time=%u"},
{Opt_journal_update, "journal=update"},
{Opt_journal_inum, "journal=%u"},
{Opt_journal_dev, "journal_dev=%u"},
@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb,
option = JBD2_DEFAULT_MAX_COMMIT_AGE;
sbi->s_commit_interval = HZ * option;
break;
case Opt_max_batch_time:
/* Parse the max_batch_time=<n> argument; a parse failure rejects the options. */
if (match_int(&args[0], &option))
return 0;
/* Negative values are rejected. */
if (option < 0)
return 0;
/*
 * A value of 0 is replaced by EXT4_DEF_MAX_BATCH_TIME, i.e. it selects
 * the default rather than storing 0.
 * NOTE(review): the option documentation says setting max_batch_time
 * to 0 turns the optimization off entirely -- confirm which behavior
 * is intended.
 */
if (option == 0)
option = EXT4_DEF_MAX_BATCH_TIME;
sbi->s_max_batch_time = option;
break;
case Opt_min_batch_time:
if (match_int(&args[0], &option))
return 0;
if (option < 0)
return 0;
sbi->s_min_batch_time = option;
break;
case Opt_data_journal:
data_opt = EXT4_MOUNT_JOURNAL_DATA;
goto datacheck;
@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
set_opt(sbi->s_mount_opt, RESERVATION);
set_opt(sbi->s_mount_opt, BARRIER);
@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_commit_interval)
journal->j_commit_interval = sbi->s_commit_interval;
/* We could also set up an ext4-specific default for the commit
* interval here, but for now we'll just fall back to the jbd
* default. */
journal->j_commit_interval = sbi->s_commit_interval;
journal->j_min_batch_time = sbi->s_min_batch_time;
journal->j_max_batch_time = sbi->s_max_batch_time;
spin_lock(&journal->j_state_lock);
if (test_opt(sb, BARRIER))
@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
old_opts.s_resuid = sbi->s_resuid;
old_opts.s_resgid = sbi->s_resgid;
old_opts.s_commit_interval = sbi->s_commit_interval;
old_opts.s_min_batch_time = sbi->s_min_batch_time;
old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
@ -3178,6 +3209,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
sbi->s_resuid = old_opts.s_resuid;
sbi->s_resgid = old_opts.s_resgid;
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {

View File

@ -964,6 +964,8 @@ static journal_t * journal_init_common (void)
spin_lock_init(&journal->j_state_lock);
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;

View File

@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle)
trans_time = ktime_to_ns(ktime_sub(ktime_get(),
transaction->t_start_time));
commit_time = max_t(u64, commit_time,
1000*journal->j_min_batch_time);
commit_time = min_t(u64, commit_time,
1000*jiffies_to_usecs(1));
1000*journal->j_max_batch_time);
if (trans_time < commit_time) {
ktime_t expires = ktime_add_ns(ktime_get(),

View File

@ -956,6 +956,14 @@ struct journal_s
*/
u64 j_average_commit_time;
/*
* minimum and maximum times, in microseconds, that we should
* wait for additional filesystem operations to get batched
* into a synchronous handle
*/
u32 j_min_batch_time;
u32 j_max_batch_time;
/* This function is called when a transaction is closed */
void (*j_commit_callback)(journal_t *,
transaction_t *);