Lots of cleanups and bug fixes this cycle, primarily in the block
allocation, extent management, fast commit, and journalling.
 -----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCAAdFiEEK2m5VNv+CHkogTfJ8vlZVpUNgaMFAmbsGRcACgkQ8vlZVpUN
 gaP+pwgAop3LUpOFQ9dPRTR3+37AJI8adfabfLIDkEkoVA7lyYY/6Q8pcQ0rklq3
 wE1WxrJ7MaE1GaFCwRIDIL6TP+uYRK0pPjqbFBxGakhDc+WXrTcALOWWofb7J7PL
 FLwP264lRRfKfpMHdK8bx6YHnEN8425PR+ZNXGVPsw+wjo72mmnq54w+ct1iOKiw
 dKfIrwwCGKlBsNdYHS/XsSx7MMK8e7nsKoSq0UtpJ4PqF11/asOtlYYODc4hd27U
 E3I3UDKuntmz+meAscDejOJqQk5FT184HIt/Y5JfetKU2zpUFj9IKqXDzMjijdaj
 vGn9RkTXfJdxMPm1ouF2R6KIRJollg==
 =V7+A
 -----END PGP SIGNATURE-----

Merge tag 'ext4_for_linus-6.12-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Lots of cleanups and bug fixes this cycle, primarily in the block
  allocation, extent management, fast commit, and journalling"

* tag 'ext4_for_linus-6.12-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (93 commits)
  ext4: convert EXT4_B2C(sbi->s_stripe) users to EXT4_NUM_B2C
  ext4: check stripe size compatibility on remount as well
  ext4: fix i_data_sem unlock order in ext4_ind_migrate()
  ext4: remove the special buffer dirty handling in do_journal_get_write_access
  ext4: fix a potential assertion failure due to improperly dirtied buffer
  ext4: hoist ext4_block_write_begin and replace the __block_write_begin
  ext4: persist the new uptodate buffers in ext4_journalled_zero_new_buffers
  ext4: dax: keep orphan list before truncate overflow allocated blocks
  ext4: fix error message when rejecting the default hash
  ext4: save unnecessary indentation in ext4_ext_create_new_leaf()
  ext4: make some fast commit functions reuse extents path
  ext4: refactor ext4_swap_extents() to reuse extents path
  ext4: get rid of ppath in convert_initialized_extent()
  ext4: get rid of ppath in ext4_ext_handle_unwritten_extents()
  ext4: get rid of ppath in ext4_ext_convert_to_initialized()
  ext4: get rid of ppath in ext4_convert_unwritten_extents_endio()
  ext4: get rid of ppath in ext4_split_convert_extents()
  ext4: get rid of ppath in ext4_split_extent()
  ext4: get rid of ppath in ext4_force_split_extent_at()
  ext4: get rid of ppath in ext4_split_extent_at()
  ...
This commit is contained in:
Linus Torvalds 2024-09-20 19:26:45 -07:00
commit 056f8c437d
25 changed files with 991 additions and 996 deletions

View File

@ -212,16 +212,6 @@ When mounting an ext4 filesystem, the following option are accepted:
that ext4's inode table readahead algorithm will pre-read into the that ext4's inode table readahead algorithm will pre-read into the
buffer cache. The default value is 32 blocks. buffer cache. The default value is 32 blocks.
nouser_xattr
Disables Extended User Attributes. See the attr(5) manual page for
more information about extended attributes.
noacl
This option disables POSIX Access Control List support. If ACL support
is enabled in the kernel configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL
is enabled by default on mount. See the acl(5) manual page for more
information about acl.
bsddf (*) bsddf (*)
Make 'df' act like BSD. Make 'df' act like BSD.

View File

@ -18,15 +18,17 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
int ext4_inode_bitmap_csum_verify(struct super_block *sb, int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp, struct ext4_group_desc *gdp,
struct buffer_head *bh, int sz) struct buffer_head *bh)
{ {
__u32 hi; __u32 hi;
__u32 provided, calculated; __u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
if (!ext4_has_metadata_csum(sb)) if (!ext4_has_metadata_csum(sb))
return 1; return 1;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
@ -40,14 +42,16 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
void ext4_inode_bitmap_csum_set(struct super_block *sb, void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp, struct ext4_group_desc *gdp,
struct buffer_head *bh, int sz) struct buffer_head *bh)
{ {
__u32 csum; __u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
if (!ext4_has_metadata_csum(sb)) if (!ext4_has_metadata_csum(sb))
return; return;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)

View File

@ -280,12 +280,20 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
struct fscrypt_str de_name = struct fscrypt_str de_name =
FSTR_INIT(de->name, FSTR_INIT(de->name,
de->name_len); de->name_len);
u32 hash;
u32 minor_hash;
if (IS_CASEFOLDED(inode)) {
hash = EXT4_DIRENT_HASH(de);
minor_hash = EXT4_DIRENT_MINOR_HASH(de);
} else {
hash = 0;
minor_hash = 0;
}
/* Directory is encrypted */ /* Directory is encrypted */
err = fscrypt_fname_disk_to_usr(inode, err = fscrypt_fname_disk_to_usr(inode,
EXT4_DIRENT_HASH(de), hash, minor_hash, &de_name, &fstr);
EXT4_DIRENT_MINOR_HASH(de),
&de_name, &fstr);
de_name = fstr; de_name = fstr;
fstr.len = save_len; fstr.len = save_len;
if (err) if (err)

View File

@ -1058,6 +1058,7 @@ struct ext4_inode_info {
/* Number of ongoing updates on this inode */ /* Number of ongoing updates on this inode */
atomic_t i_fc_updates; atomic_t i_fc_updates;
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
/* Fast commit wait queue for this inode */ /* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait; wait_queue_head_t i_fc_wait;
@ -1106,6 +1107,10 @@ struct ext4_inode_info {
/* mballoc */ /* mballoc */
atomic_t i_prealloc_active; atomic_t i_prealloc_active;
/* allocation reservation info for delalloc */
/* In case of bigalloc, this refer to clusters rather than blocks */
unsigned int i_reserved_data_blocks;
struct rb_root i_prealloc_node; struct rb_root i_prealloc_node;
rwlock_t i_prealloc_lock; rwlock_t i_prealloc_lock;
@ -1122,10 +1127,6 @@ struct ext4_inode_info {
/* ialloc */ /* ialloc */
ext4_group_t i_last_alloc_group; ext4_group_t i_last_alloc_group;
/* allocation reservation info for delalloc */
/* In case of bigalloc, this refer to clusters rather than blocks */
unsigned int i_reserved_data_blocks;
/* pending cluster reservations for bigalloc file systems */ /* pending cluster reservations for bigalloc file systems */
struct ext4_pending_tree i_pending_tree; struct ext4_pending_tree i_pending_tree;
@ -1149,7 +1150,6 @@ struct ext4_inode_info {
*/ */
struct list_head i_rsv_conversion_list; struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work; struct work_struct i_rsv_conversion_work;
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock; spinlock_t i_block_reservation_lock;
@ -2338,9 +2338,9 @@ struct ext4_dir_entry_2 {
((struct ext4_dir_entry_hash *) \ ((struct ext4_dir_entry_hash *) \
(((void *)(entry)) + \ (((void *)(entry)) + \
((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) #define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
#define EXT4_DIRENT_MINOR_HASH(entry) \ #define EXT4_DIRENT_MINOR_HASH(entry) \
le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)
static inline bool ext4_hash_in_dirent(const struct inode *inode) static inline bool ext4_hash_in_dirent(const struct inode *inode)
{ {
@ -2462,6 +2462,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_HALF_MD4_UNSIGNED 4
#define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_TEA_UNSIGNED 5
#define DX_HASH_SIPHASH 6 #define DX_HASH_SIPHASH 6
#define DX_HASH_LAST DX_HASH_SIPHASH
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
const void *address, unsigned int length) const void *address, unsigned int length)
@ -2695,10 +2696,10 @@ struct mmpd_data {
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb, void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp, struct ext4_group_desc *gdp,
struct buffer_head *bh, int sz); struct buffer_head *bh);
int ext4_inode_bitmap_csum_verify(struct super_block *sb, int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp, struct ext4_group_desc *gdp,
struct buffer_head *bh, int sz); struct buffer_head *bh);
void ext4_block_bitmap_csum_set(struct super_block *sb, void ext4_block_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp, struct ext4_group_desc *gdp,
struct buffer_head *bh); struct buffer_head *bh);
@ -3712,11 +3713,12 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num, int num,
struct ext4_ext_path *path); struct ext4_ext_path *path);
extern int ext4_ext_insert_extent(handle_t *, struct inode *, extern struct ext4_ext_path *ext4_ext_insert_extent(
struct ext4_ext_path **, handle_t *handle, struct inode *inode,
struct ext4_extent *, int); struct ext4_ext_path *path,
struct ext4_extent *newext, int gb_flags);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path **, struct ext4_ext_path *,
int flags); int flags);
extern void ext4_free_ext_path(struct ext4_ext_path *); extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode); extern int ext4_ext_check_inode(struct inode *inode);
@ -3853,6 +3855,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
return buffer_uptodate(bh); return buffer_uptodate(bh);
} }
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block);
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSBADCRC EBADMSG /* Bad CRC detected */

File diff suppressed because it is too large Load Diff

View File

@ -558,8 +558,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
if (ext4_es_is_hole(es1)) if (ext4_es_is_hole(es1))
return 1; return 1;
/* we need to check delayed extent is without unwritten status */ /* we need to check delayed extent */
if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) if (ext4_es_is_delayed(es1))
return 1; return 1;
return 0; return 0;
@ -848,11 +848,12 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
*/ */
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk, ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status) unsigned int status, int flags)
{ {
struct extent_status newes; struct extent_status newes;
ext4_lblk_t end = lblk + len - 1; ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0; int err1 = 0, err2 = 0, err3 = 0;
int resv_used = 0, pending = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status *es1 = NULL; struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL; struct extent_status *es2 = NULL;
@ -862,21 +863,14 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return; return;
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino); lblk, len, pblk, status, flags, inode->i_ino);
if (!len) if (!len)
return; return;
BUG_ON(end < lblk); BUG_ON(end < lblk);
WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);
if ((status & EXTENT_STATUS_DELAYED) &&
(status & EXTENT_STATUS_WRITTEN)) {
ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
" delayed and written which can potentially "
" cause data loss.", lblk, len);
WARN_ON(1);
}
newes.es_lblk = lblk; newes.es_lblk = lblk;
newes.es_len = len; newes.es_len = len;
@ -894,11 +888,11 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
es1 = __es_alloc_extent(true); es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2) if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true); es2 = __es_alloc_extent(true);
if ((err1 || err2 || err3) && revise_pending && !pr) if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
pr = __alloc_pending(true); pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock); write_lock(&EXT4_I(inode)->i_es_lock);
err1 = __es_remove_extent(inode, lblk, end, NULL, es1); err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
if (err1 != 0) if (err1 != 0)
goto error; goto error;
/* Free preallocated extent if it didn't get used. */ /* Free preallocated extent if it didn't get used. */
@ -922,16 +916,38 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
if (revise_pending) { if (revise_pending) {
err3 = __revise_pending(inode, lblk, len, &pr); err3 = __revise_pending(inode, lblk, len, &pr);
if (err3 != 0) if (err3 < 0)
goto error; goto error;
if (pr) { if (pr) {
__free_pending(pr); __free_pending(pr);
pr = NULL; pr = NULL;
} }
pending = err3;
} }
error: error:
write_unlock(&EXT4_I(inode)->i_es_lock); write_unlock(&EXT4_I(inode)->i_es_lock);
if (err1 || err2 || err3) /*
* Reduce the reserved cluster count to reflect successful deferred
* allocation of delayed allocated clusters or direct allocation of
* clusters discovered to be delayed allocated. Once allocated, a
* cluster is not included in the reserved count.
*
* When direct allocating (from fallocate, filemap, DIO, or clusters
* allocated when delalloc has been disabled by ext4_nonda_switch())
* an extent either 1) contains delayed blocks but start with
* non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed
* allocated blocks which belong to delayed allocated clusters when
* bigalloc feature is enabled, quota has already been claimed by
* ext4_mb_new_blocks(), so release the quota reservations made for
* any previously delayed allocated clusters instead of claim them
* again.
*/
resv_used += pending;
if (resv_used)
ext4_da_update_reserve_space(inode, resv_used,
flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
if (err1 || err2 || err3 < 0)
goto retry; goto retry;
ext4_es_print_tree(inode); ext4_es_print_tree(inode);
@ -1051,7 +1067,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
} }
struct rsvd_count { struct rsvd_count {
int ndelonly; int ndelayed;
bool first_do_lblk_found; bool first_do_lblk_found;
ext4_lblk_t first_do_lblk; ext4_lblk_t first_do_lblk;
ext4_lblk_t last_do_lblk; ext4_lblk_t last_do_lblk;
@ -1077,10 +1093,10 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct rb_node *node; struct rb_node *node;
rc->ndelonly = 0; rc->ndelayed = 0;
/* /*
* for bigalloc, note the first delonly block in the range has not * for bigalloc, note the first delayed block in the range has not
* been found, record the extent containing the block to the left of * been found, record the extent containing the block to the left of
* the region to be removed, if any, and note that there's no partial * the region to be removed, if any, and note that there's no partial
* cluster to track * cluster to track
@ -1100,9 +1116,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
} }
/* /*
* count_rsvd - count the clusters containing delayed and not unwritten * count_rsvd - count the clusters containing delayed blocks in a range
* (delonly) blocks in a range within an extent and add to * within an extent and add to the running tally in rsvd_count
* the running tally in rsvd_count
* *
* @inode - file containing extent * @inode - file containing extent
* @lblk - first block in range * @lblk - first block in range
@ -1119,13 +1134,13 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t i, end, nclu; ext4_lblk_t i, end, nclu;
if (!ext4_es_is_delonly(es)) if (!ext4_es_is_delayed(es))
return; return;
WARN_ON(len <= 0); WARN_ON(len <= 0);
if (sbi->s_cluster_ratio == 1) { if (sbi->s_cluster_ratio == 1) {
rc->ndelonly += (int) len; rc->ndelayed += (int) len;
return; return;
} }
@ -1135,7 +1150,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
end = lblk + (ext4_lblk_t) len - 1; end = lblk + (ext4_lblk_t) len - 1;
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
/* record the first block of the first delonly extent seen */ /* record the first block of the first delayed extent seen */
if (!rc->first_do_lblk_found) { if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i; rc->first_do_lblk = i;
rc->first_do_lblk_found = true; rc->first_do_lblk_found = true;
@ -1149,7 +1164,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
* doesn't start with it, count it and stop tracking * doesn't start with it, count it and stop tracking
*/ */
if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
rc->ndelonly++; rc->ndelayed++;
rc->partial = false; rc->partial = false;
} }
@ -1159,7 +1174,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
*/ */
if (EXT4_LBLK_COFF(sbi, i) != 0) { if (EXT4_LBLK_COFF(sbi, i) != 0) {
if (end >= EXT4_LBLK_CFILL(sbi, i)) { if (end >= EXT4_LBLK_CFILL(sbi, i)) {
rc->ndelonly++; rc->ndelayed++;
rc->partial = false; rc->partial = false;
i = EXT4_LBLK_CFILL(sbi, i) + 1; i = EXT4_LBLK_CFILL(sbi, i) + 1;
} }
@ -1167,11 +1182,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
/* /*
* if the current cluster starts on a cluster boundary, count the * if the current cluster starts on a cluster boundary, count the
* number of whole delonly clusters in the extent * number of whole delayed clusters in the extent
*/ */
if ((i + sbi->s_cluster_ratio - 1) <= end) { if ((i + sbi->s_cluster_ratio - 1) <= end) {
nclu = (end - i + 1) >> sbi->s_cluster_bits; nclu = (end - i + 1) >> sbi->s_cluster_bits;
rc->ndelonly += nclu; rc->ndelayed += nclu;
i += nclu << sbi->s_cluster_bits; i += nclu << sbi->s_cluster_bits;
} }
@ -1231,10 +1246,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root,
* @rc - pointer to reserved count data * @rc - pointer to reserved count data
* *
* The number of reservations to be released is equal to the number of * The number of reservations to be released is equal to the number of
* clusters containing delayed and not unwritten (delonly) blocks within * clusters containing delayed blocks within the range, minus the number of
* the range, minus the number of clusters still containing delonly blocks * clusters still containing delayed blocks at the ends of the range, and
* at the ends of the range, and minus the number of pending reservations * minus the number of pending reservations within the range.
* within the range.
*/ */
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct extent_status *right_es, struct extent_status *right_es,
@ -1245,33 +1259,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
struct rb_node *node; struct rb_node *node;
ext4_lblk_t first_lclu, last_lclu; ext4_lblk_t first_lclu, last_lclu;
bool left_delonly, right_delonly, count_pending; bool left_delayed, right_delayed, count_pending;
struct extent_status *es; struct extent_status *es;
if (sbi->s_cluster_ratio > 1) { if (sbi->s_cluster_ratio > 1) {
/* count any remaining partial cluster */ /* count any remaining partial cluster */
if (rc->partial) if (rc->partial)
rc->ndelonly++; rc->ndelayed++;
if (rc->ndelonly == 0) if (rc->ndelayed == 0)
return 0; return 0;
first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
/* /*
* decrease the delonly count by the number of clusters at the * decrease the delayed count by the number of clusters at the
* ends of the range that still contain delonly blocks - * ends of the range that still contain delayed blocks -
* these clusters still need to be reserved * these clusters still need to be reserved
*/ */
left_delonly = right_delonly = false; left_delayed = right_delayed = false;
es = rc->left_es; es = rc->left_es;
while (es && ext4_es_end(es) >= while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
if (ext4_es_is_delonly(es)) { if (ext4_es_is_delayed(es)) {
rc->ndelonly--; rc->ndelayed--;
left_delonly = true; left_delayed = true;
break; break;
} }
node = rb_prev(&es->rb_node); node = rb_prev(&es->rb_node);
@ -1279,7 +1293,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
break; break;
es = rb_entry(node, struct extent_status, rb_node); es = rb_entry(node, struct extent_status, rb_node);
} }
if (right_es && (!left_delonly || first_lclu != last_lclu)) { if (right_es && (!left_delayed || first_lclu != last_lclu)) {
if (end < ext4_es_end(right_es)) { if (end < ext4_es_end(right_es)) {
es = right_es; es = right_es;
} else { } else {
@ -1289,9 +1303,9 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
} }
while (es && es->es_lblk <= while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
if (ext4_es_is_delonly(es)) { if (ext4_es_is_delayed(es)) {
rc->ndelonly--; rc->ndelayed--;
right_delonly = true; right_delayed = true;
break; break;
} }
node = rb_next(&es->rb_node); node = rb_next(&es->rb_node);
@ -1305,21 +1319,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/* /*
* Determine the block range that should be searched for * Determine the block range that should be searched for
* pending reservations, if any. Clusters on the ends of the * pending reservations, if any. Clusters on the ends of the
* original removed range containing delonly blocks are * original removed range containing delayed blocks are
* excluded. They've already been accounted for and it's not * excluded. They've already been accounted for and it's not
* possible to determine if an associated pending reservation * possible to determine if an associated pending reservation
* should be released with the information available in the * should be released with the information available in the
* extents status tree. * extents status tree.
*/ */
if (first_lclu == last_lclu) { if (first_lclu == last_lclu) {
if (left_delonly | right_delonly) if (left_delayed | right_delayed)
count_pending = false; count_pending = false;
else else
count_pending = true; count_pending = true;
} else { } else {
if (left_delonly) if (left_delayed)
first_lclu++; first_lclu++;
if (right_delonly) if (right_delayed)
last_lclu--; last_lclu--;
if (first_lclu <= last_lclu) if (first_lclu <= last_lclu)
count_pending = true; count_pending = true;
@ -1330,13 +1344,13 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/* /*
* a pending reservation found between first_lclu and last_lclu * a pending reservation found between first_lclu and last_lclu
* represents an allocated cluster that contained at least one * represents an allocated cluster that contained at least one
* delonly block, so the delonly total must be reduced by one * delayed block, so the delayed total must be reduced by one
* for each pending reservation found and released * for each pending reservation found and released
*/ */
if (count_pending) { if (count_pending) {
pr = __pr_tree_search(&tree->root, first_lclu); pr = __pr_tree_search(&tree->root, first_lclu);
while (pr && pr->lclu <= last_lclu) { while (pr && pr->lclu <= last_lclu) {
rc->ndelonly--; rc->ndelayed--;
node = rb_next(&pr->rb_node); node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root); rb_erase(&pr->rb_node, &tree->root);
__free_pending(pr); __free_pending(pr);
@ -1347,7 +1361,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
} }
} }
} }
return rc->ndelonly; return rc->ndelayed;
} }
@ -1940,7 +1954,7 @@ static struct pending_reservation *__get_pending(struct inode *inode,
* @lblk - logical block in the cluster to be added * @lblk - logical block in the cluster to be added
* @prealloc - preallocated pending entry * @prealloc - preallocated pending entry
* *
* Returns 0 on successful insertion and -ENOMEM on failure. If the * Returns 1 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully. * pending reservation is already in the set, returns successfully.
*/ */
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
@ -1984,6 +1998,7 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
rb_link_node(&pr->rb_node, parent, p); rb_link_node(&pr->rb_node, parent, p);
rb_insert_color(&pr->rb_node, &tree->root); rb_insert_color(&pr->rb_node, &tree->root);
ret = 1;
out: out:
return ret; return ret;
@ -2105,7 +2120,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
es1 = __es_alloc_extent(true); es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2) if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true); es2 = __es_alloc_extent(true);
if (err1 || err2 || err3) { if (err1 || err2 || err3 < 0) {
if (lclu_allocated && !pr1) if (lclu_allocated && !pr1)
pr1 = __alloc_pending(true); pr1 = __alloc_pending(true);
if (end_allocated && !pr2) if (end_allocated && !pr2)
@ -2135,7 +2150,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
if (lclu_allocated) { if (lclu_allocated) {
err3 = __insert_pending(inode, lblk, &pr1); err3 = __insert_pending(inode, lblk, &pr1);
if (err3 != 0) if (err3 < 0)
goto error; goto error;
if (pr1) { if (pr1) {
__free_pending(pr1); __free_pending(pr1);
@ -2144,7 +2159,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
} }
if (end_allocated) { if (end_allocated) {
err3 = __insert_pending(inode, end, &pr2); err3 = __insert_pending(inode, end, &pr2);
if (err3 != 0) if (err3 < 0)
goto error; goto error;
if (pr2) { if (pr2) {
__free_pending(pr2); __free_pending(pr2);
@ -2153,7 +2168,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
} }
error: error:
write_unlock(&EXT4_I(inode)->i_es_lock); write_unlock(&EXT4_I(inode)->i_es_lock);
if (err1 || err2 || err3) if (err1 || err2 || err3 < 0)
goto retry; goto retry;
ext4_es_print_tree(inode); ext4_es_print_tree(inode);
@ -2161,94 +2176,6 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
return; return;
} }
/*
* __es_delayed_clu - count number of clusters containing blocks that
* are delayed only
*
* @inode - file containing block range
* @start - logical block defining start of range
* @end - logical block defining end of range
*
* Returns the number of clusters containing only delayed (not delayed
* and unwritten) blocks in the range specified by @start and @end. Any
* cluster or part of a cluster within the range and containing a delayed
* and not unwritten block within the range is counted as a whole cluster.
*/
static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct rb_node *node;
ext4_lblk_t first_lclu, last_lclu;
unsigned long long last_counted_lclu;
unsigned int n = 0;
/* guaranteed to be unequal to any ext4_lblk_t value */
last_counted_lclu = ~0ULL;
es = __es_tree_search(&tree->root, start);
while (es && (es->es_lblk <= end)) {
if (ext4_es_is_delonly(es)) {
if (es->es_lblk <= start)
first_lclu = EXT4_B2C(sbi, start);
else
first_lclu = EXT4_B2C(sbi, es->es_lblk);
if (ext4_es_end(es) >= end)
last_lclu = EXT4_B2C(sbi, end);
else
last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
if (first_lclu == last_counted_lclu)
n += last_lclu - first_lclu;
else
n += last_lclu - first_lclu + 1;
last_counted_lclu = last_lclu;
}
node = rb_next(&es->rb_node);
if (!node)
break;
es = rb_entry(node, struct extent_status, rb_node);
}
return n;
}
/*
* ext4_es_delayed_clu - count number of clusters containing blocks that
* are both delayed and unwritten
*
* @inode - file containing block range
* @lblk - logical block defining start of range
* @len - number of blocks in range
*
* Locking for external use of __es_delayed_clu().
*/
unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len)
{
struct ext4_inode_info *ei = EXT4_I(inode);
ext4_lblk_t end;
unsigned int n;
if (len == 0)
return 0;
end = lblk + len - 1;
WARN_ON(end < lblk);
read_lock(&ei->i_es_lock);
n = __es_delayed_clu(inode, lblk, end);
read_unlock(&ei->i_es_lock);
return n;
}
/* /*
* __revise_pending - makes, cancels, or leaves unchanged pending cluster * __revise_pending - makes, cancels, or leaves unchanged pending cluster
* reservations for a specified block range depending * reservations for a specified block range depending
@ -2263,7 +2190,9 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
* *
* Used after a newly allocated extent is added to the extents status tree. * Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten * Requires that the extents in the range have either written or unwritten
* status. Must be called while holding i_es_lock. * status. Must be called while holding i_es_lock. Returns number of new
* inserts pending cluster on insert pendings, returns 0 on remove pendings,
* return -ENOMEM on failure.
*/ */
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_lblk_t len,
@ -2273,6 +2202,7 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end = lblk + len - 1; ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last; ext4_lblk_t first, last;
bool f_del = false, l_del = false; bool f_del = false, l_del = false;
int pendings = 0;
int ret = 0; int ret = 0;
if (len == 0) if (len == 0)
@ -2294,49 +2224,53 @@ static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) { if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
first = EXT4_LBLK_CMASK(sbi, lblk); first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk) if (first != lblk)
f_del = __es_scan_range(inode, &ext4_es_is_delonly, f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1); first, lblk - 1);
if (f_del) { if (f_del) {
ret = __insert_pending(inode, first, prealloc); ret = __insert_pending(inode, first, prealloc);
if (ret < 0) if (ret < 0)
goto out; goto out;
pendings += ret;
} else { } else {
last = EXT4_LBLK_CMASK(sbi, end) + last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1; sbi->s_cluster_ratio - 1;
if (last != end) if (last != end)
l_del = __es_scan_range(inode, l_del = __es_scan_range(inode,
&ext4_es_is_delonly, &ext4_es_is_delayed,
end + 1, last); end + 1, last);
if (l_del) { if (l_del) {
ret = __insert_pending(inode, last, prealloc); ret = __insert_pending(inode, last, prealloc);
if (ret < 0) if (ret < 0)
goto out; goto out;
pendings += ret;
} else } else
__remove_pending(inode, last); __remove_pending(inode, last);
} }
} else { } else {
first = EXT4_LBLK_CMASK(sbi, lblk); first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk) if (first != lblk)
f_del = __es_scan_range(inode, &ext4_es_is_delonly, f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1); first, lblk - 1);
if (f_del) { if (f_del) {
ret = __insert_pending(inode, first, prealloc); ret = __insert_pending(inode, first, prealloc);
if (ret < 0) if (ret < 0)
goto out; goto out;
pendings += ret;
} else } else
__remove_pending(inode, first); __remove_pending(inode, first);
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end) if (last != end)
l_del = __es_scan_range(inode, &ext4_es_is_delonly, l_del = __es_scan_range(inode, &ext4_es_is_delayed,
end + 1, last); end + 1, last);
if (l_del) { if (l_del) {
ret = __insert_pending(inode, last, prealloc); ret = __insert_pending(inode, last, prealloc);
if (ret < 0) if (ret < 0)
goto out; goto out;
pendings += ret;
} else } else
__remove_pending(inode, last); __remove_pending(inode, last);
} }
out: out:
return ret; return (ret < 0) ? ret : pendings;
} }

View File

@ -42,6 +42,10 @@ enum {
#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)
/*
* Besides EXTENT_STATUS_REFERENCED, all these extent type masks
* are exclusive, only one type can be set at a time.
*/
#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B)
@ -51,7 +55,9 @@ enum {
#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_UNWRITTEN | \
EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_DELAYED | \
EXTENT_STATUS_HOLE) << ES_SHIFT) EXTENT_STATUS_HOLE))
#define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1)))
struct ext4_sb_info; struct ext4_sb_info;
struct ext4_extent; struct ext4_extent;
@ -129,7 +135,7 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk, ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status); unsigned int status, int flags);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk, ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status); unsigned int status);
@ -156,7 +162,7 @@ static inline unsigned int ext4_es_status(struct extent_status *es)
static inline unsigned int ext4_es_type(struct extent_status *es) static inline unsigned int ext4_es_type(struct extent_status *es)
{ {
return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK;
} }
static inline int ext4_es_is_written(struct extent_status *es) static inline int ext4_es_is_written(struct extent_status *es)
@ -184,11 +190,6 @@ static inline int ext4_es_is_mapped(struct extent_status *es)
return (ext4_es_is_written(es) || ext4_es_is_unwritten(es)); return (ext4_es_is_written(es) || ext4_es_is_unwritten(es));
} }
/*
 * Report whether an extent is delayed without also being unwritten.
 * Returns 1 when ext4_es_is_delayed() holds and ext4_es_is_unwritten()
 * does not, 0 otherwise.
 */
static inline int ext4_es_is_delonly(struct extent_status *es)
{
	/* Not delayed at all: cannot be "delayed only". */
	if (!ext4_es_is_delayed(es))
		return 0;

	return !ext4_es_is_unwritten(es);
}
static inline void ext4_es_set_referenced(struct extent_status *es) static inline void ext4_es_set_referenced(struct extent_status *es)
{ {
es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
@ -224,17 +225,12 @@ static inline void ext4_es_store_pblock(struct extent_status *es,
es->es_pblk = block; es->es_pblk = block;
} }
static inline void ext4_es_store_status(struct extent_status *es,
unsigned int status)
{
es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
(es->es_pblk & ~ES_MASK);
}
static inline void ext4_es_store_pblock_status(struct extent_status *es, static inline void ext4_es_store_pblock_status(struct extent_status *es,
ext4_fsblk_t pb, ext4_fsblk_t pb,
unsigned int status) unsigned int status)
{ {
WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK));
es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) |
(pb & ~ES_MASK); (pb & ~ES_MASK);
} }
@ -252,8 +248,6 @@ extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, bool lclu_allocated, ext4_lblk_t len, bool lclu_allocated,
bool end_allocated); bool end_allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode); extern void ext4_clear_inode_es(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */ #endif /* _EXT4_EXTENTS_STATUS_H */

View File

@ -339,22 +339,29 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
{ {
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
tid_t tid; tid_t tid;
bool has_transaction = true;
bool is_ineligible;
if (ext4_fc_disabled(sb)) if (ext4_fc_disabled(sb))
return; return;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (handle && !IS_ERR(handle)) if (handle && !IS_ERR(handle))
tid = handle->h_transaction->t_tid; tid = handle->h_transaction->t_tid;
else { else {
read_lock(&sbi->s_journal->j_state_lock); read_lock(&sbi->s_journal->j_state_lock);
tid = sbi->s_journal->j_running_transaction ? if (sbi->s_journal->j_running_transaction)
sbi->s_journal->j_running_transaction->t_tid : 0; tid = sbi->s_journal->j_running_transaction->t_tid;
else
has_transaction = false;
read_unlock(&sbi->s_journal->j_state_lock); read_unlock(&sbi->s_journal->j_state_lock);
} }
spin_lock(&sbi->s_fc_lock); spin_lock(&sbi->s_fc_lock);
if (tid_gt(tid, sbi->s_fc_ineligible_tid)) is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (has_transaction &&
(!is_ineligible ||
(is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid))))
sbi->s_fc_ineligible_tid = tid; sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
spin_unlock(&sbi->s_fc_lock); spin_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX); WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
@ -1288,8 +1295,21 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_del_init(&iter->i_fc_list); list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode, ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING); EXT4_STATE_FC_COMMITTING);
if (tid_geq(tid, iter->i_sync_tid)) if (tid_geq(tid, iter->i_sync_tid)) {
ext4_fc_reset_inode(&iter->vfs_inode); ext4_fc_reset_inode(&iter->vfs_inode);
} else if (full) {
/*
* We are called after a full commit, inode has been
* modified while the commit was running. Re-enqueue
* the inode into STAGING, which will then be spliced
* back into MAIN. This cannot happen during
* fastcommit because the journal is locked all the
* time in that case (and tid doesn't increase so
* tid check above isn't reliable).
*/
list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
&sbi->s_fc_q[FC_Q_STAGING]);
}
/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
smp_mb(); smp_mb();
#if (BITS_PER_LONG < 64) #if (BITS_PER_LONG < 64)
@ -1772,7 +1792,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
if (ret == 0) { if (ret == 0) {
/* Range is not mapped */ /* Range is not mapped */
path = ext4_find_extent(inode, cur, NULL, 0); path = ext4_find_extent(inode, cur, path, 0);
if (IS_ERR(path)) if (IS_ERR(path))
goto out; goto out;
memset(&newex, 0, sizeof(newex)); memset(&newex, 0, sizeof(newex));
@ -1783,11 +1803,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
if (ext4_ext_is_unwritten(ex)) if (ext4_ext_is_unwritten(ex))
ext4_ext_mark_unwritten(&newex); ext4_ext_mark_unwritten(&newex);
down_write(&EXT4_I(inode)->i_data_sem); down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_insert_extent( path = ext4_ext_insert_extent(NULL, inode,
NULL, inode, &path, &newex, 0); path, &newex, 0);
up_write((&EXT4_I(inode)->i_data_sem)); up_write((&EXT4_I(inode)->i_data_sem));
ext4_free_ext_path(path); if (IS_ERR(path))
if (ret)
goto out; goto out;
goto next; goto next;
} }
@ -1836,6 +1855,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
sb->s_blocksize_bits); sb->s_blocksize_bits);
out: out:
ext4_free_ext_path(path);
iput(inode); iput(inode);
return 0; return 0;
} }
@ -1936,12 +1956,13 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
break; break;
if (ret > 0) { if (ret > 0) {
path = ext4_find_extent(inode, map.m_lblk, NULL, 0); path = ext4_find_extent(inode, map.m_lblk, path, 0);
if (!IS_ERR(path)) { if (!IS_ERR(path)) {
for (j = 0; j < path->p_depth; j++) for (j = 0; j < path->p_depth; j++)
ext4_mb_mark_bb(inode->i_sb, ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, true); path[j].p_block, 1, true);
ext4_free_ext_path(path); } else {
path = NULL;
} }
cur += ret; cur += ret;
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
@ -1952,6 +1973,8 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
} }
iput(inode); iput(inode);
} }
ext4_free_ext_path(path);
} }
/* /*

View File

@ -306,7 +306,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
} }
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
ssize_t count) ssize_t written, ssize_t count)
{ {
handle_t *handle; handle_t *handle;
@ -315,7 +315,7 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
if (IS_ERR(handle)) if (IS_ERR(handle))
return PTR_ERR(handle); return PTR_ERR(handle);
if (ext4_update_inode_size(inode, offset + count)) { if (ext4_update_inode_size(inode, offset + written)) {
int ret = ext4_mark_inode_dirty(handle, inode); int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) { if (unlikely(ret)) {
ext4_journal_stop(handle); ext4_journal_stop(handle);
@ -323,21 +323,21 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
} }
} }
if (inode->i_nlink) if ((written == count) && inode->i_nlink)
ext4_orphan_del(handle, inode); ext4_orphan_del(handle, inode);
ext4_journal_stop(handle); ext4_journal_stop(handle);
return count; return written;
} }
/* /*
* Clean up the inode after DIO or DAX extending write has completed and the * Clean up the inode after DIO or DAX extending write has completed and the
* inode size has been updated using ext4_handle_inode_extension(). * inode size has been updated using ext4_handle_inode_extension().
*/ */
static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count) static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc)
{ {
lockdep_assert_held_write(&inode->i_rwsem); lockdep_assert_held_write(&inode->i_rwsem);
if (count < 0) { if (need_trunc) {
ext4_truncate_failed_write(inode); ext4_truncate_failed_write(inode);
/* /*
* If the truncate operation failed early, then the inode may * If the truncate operation failed early, then the inode may
@ -393,7 +393,7 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
pos + size <= i_size_read(inode)) pos + size <= i_size_read(inode))
return size; return size;
return ext4_handle_inode_extension(inode, pos, size); return ext4_handle_inode_extension(inode, pos, size, size);
} }
static const struct iomap_dio_ops ext4_dio_write_ops = { static const struct iomap_dio_ops ext4_dio_write_ops = {
@ -586,7 +586,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
* writeback of delalloc blocks. * writeback of delalloc blocks.
*/ */
WARN_ON_ONCE(ret == -EIOCBQUEUED); WARN_ON_ONCE(ret == -EIOCBQUEUED);
ext4_inode_extension_cleanup(inode, ret); ext4_inode_extension_cleanup(inode, ret < 0);
} }
out: out:
@ -669,8 +669,8 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
if (extend) { if (extend) {
ret = ext4_handle_inode_extension(inode, offset, ret); ret = ext4_handle_inode_extension(inode, offset, ret, count);
ext4_inode_extension_cleanup(inode, ret); ext4_inode_extension_cleanup(inode, ret < (ssize_t)count);
} }
out: out:
inode_unlock(inode); inode_unlock(inode);

View File

@ -87,10 +87,10 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return 0; return 0;
grp = ext4_get_group_info(sb, block_group);
if (buffer_verified(bh)) if (buffer_verified(bh))
return 0; return 0;
grp = ext4_get_group_info(sb, block_group);
if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
return -EFSCORRUPTED; return -EFSCORRUPTED;
@ -98,8 +98,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
if (buffer_verified(bh)) if (buffer_verified(bh))
goto verified; goto verified;
blk = ext4_inode_bitmap(sb, desc); blk = ext4_inode_bitmap(sb, desc);
if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, if (!ext4_inode_bitmap_csum_verify(sb, desc, bh) ||
EXT4_INODES_PER_GROUP(sb) / 8) ||
ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
ext4_unlock_group(sb, block_group); ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
@ -327,8 +326,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (percpu_counter_initialized(&sbi->s_dirs_counter)) if (percpu_counter_initialized(&sbi->s_dirs_counter))
percpu_counter_dec(&sbi->s_dirs_counter); percpu_counter_dec(&sbi->s_dirs_counter);
} }
ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh);
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, block_group, gdp); ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group); ext4_unlock_group(sb, block_group);
@ -514,6 +512,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
if (min_inodes < 1) if (min_inodes < 1)
min_inodes = 1; min_inodes = 1;
min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4; min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;
if (min_clusters < 0)
min_clusters = 0;
/* /*
* Start looking in the flex group where we last allocated an * Start looking in the flex group where we last allocated an
@ -755,10 +755,10 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
struct ext4_group_desc *gdp; struct ext4_group_desc *gdp;
ext4_group_t group; ext4_group_t group;
int bit; int bit;
int err = -EFSCORRUPTED; int err;
if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
goto out; return -EFSCORRUPTED;
group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
@ -772,7 +772,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
} }
gdp = ext4_get_group_desc(sb, group, &group_desc_bh); gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp || !group_desc_bh) { if (!gdp) {
err = -EINVAL; err = -EINVAL;
goto out; goto out;
} }
@ -851,8 +851,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (ext4_has_group_desc_csum(sb)) { if (ext4_has_group_desc_csum(sb)) {
ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, group, gdp); ext4_group_desc_csum_set(sb, group, gdp);
} }
@ -860,6 +859,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh); err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
sync_dirty_buffer(group_desc_bh); sync_dirty_buffer(group_desc_bh);
out: out:
brelse(inode_bitmap_bh);
return err; return err;
} }
@ -1053,14 +1053,14 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
brelse(inode_bitmap_bh); brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */ /* Skip groups with suspicious inode tables */
if (((!(sbi->s_mount_state & EXT4_FC_REPLAY)) if (IS_ERR(inode_bitmap_bh)) {
&& EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL; inode_bitmap_bh = NULL;
goto next_group; goto next_group;
} }
if (!(sbi->s_mount_state & EXT4_FC_REPLAY) &&
EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
goto next_group;
repeat_in_this_group:
ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
if (!ret2) if (!ret2)
goto next_group; goto next_group;
@ -1110,8 +1110,6 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
if (!ret2) if (!ret2)
goto got; /* we grabbed the inode! */ goto got; /* we grabbed the inode! */
if (ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
next_group: next_group:
if (++group == ngroups) if (++group == ngroups)
group = 0; group = 0;
@ -1224,8 +1222,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
} }
} }
if (ext4_has_group_desc_csum(sb)) { if (ext4_has_group_desc_csum(sb)) {
ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, group, gdp); ext4_group_desc_csum_set(sb, group, gdp);
} }
ext4_unlock_group(sb, group); ext4_unlock_group(sb, group);

View File

@ -652,13 +652,6 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_update_inode_fsync_trans(handle, inode, 1); ext4_update_inode_fsync_trans(handle, inode, 1);
count = ar.len; count = ar.len;
/*
* Update reserved blocks/metadata blocks after successful block
* allocation which had been deferred till now.
*/
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
ext4_da_update_reserve_space(inode, count, 1);
got_it: got_it:
map->m_flags |= EXT4_MAP_MAPPED; map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key); map->m_pblk = le32_to_cpu(chain[depth-1].key);

View File

@ -601,10 +601,11 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
goto out; goto out;
if (ext4_should_dioread_nolock(inode)) { if (ext4_should_dioread_nolock(inode)) {
ret = __block_write_begin(folio, from, to, ret = ext4_block_write_begin(handle, folio, from, to,
ext4_get_block_unwritten); ext4_get_block_unwritten);
} else } else
ret = __block_write_begin(folio, from, to, ext4_get_block); ret = ext4_block_write_begin(handle, folio, from, to,
ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) { if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode, ret = ext4_walk_page_buffers(handle, inode,
@ -856,7 +857,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
goto out; goto out;
} }
ret = __block_write_begin(folio, 0, inline_size, ret = ext4_block_write_begin(NULL, folio, 0, inline_size,
ext4_da_get_block_prep); ext4_da_get_block_prep);
if (ret) { if (ret) {
up_read(&EXT4_I(inode)->xattr_sem); up_read(&EXT4_I(inode)->xattr_sem);
@ -1665,24 +1666,36 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir, struct ext4_dir_entry_2 **res_dir,
int *has_inline_data) int *has_inline_data)
{ {
struct ext4_xattr_ibody_find is = {
.s = { .not_found = -ENODATA, },
};
struct ext4_xattr_info i = {
.name_index = EXT4_XATTR_INDEX_SYSTEM,
.name = EXT4_XATTR_SYSTEM_DATA,
};
int ret; int ret;
struct ext4_iloc iloc;
void *inline_start; void *inline_start;
int inline_size; int inline_size;
if (ext4_get_inode_loc(dir, &iloc)) ret = ext4_get_inode_loc(dir, &is.iloc);
return NULL; if (ret)
return ERR_PTR(ret);
down_read(&EXT4_I(dir)->xattr_sem); down_read(&EXT4_I(dir)->xattr_sem);
ret = ext4_xattr_ibody_find(dir, &i, &is);
if (ret)
goto out;
if (!ext4_has_inline_data(dir)) { if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0; *has_inline_data = 0;
goto out; goto out;
} }
inline_start = (void *)ext4_raw_inode(&iloc)->i_block + inline_start = (void *)ext4_raw_inode(&is.iloc)->i_block +
EXT4_INLINE_DOTDOT_SIZE; EXT4_INLINE_DOTDOT_SIZE;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
ret = ext4_search_dir(iloc.bh, inline_start, inline_size, ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size,
dir, fname, 0, res_dir); dir, fname, 0, res_dir);
if (ret == 1) if (ret == 1)
goto out_find; goto out_find;
@ -1692,20 +1705,23 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir,
if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
goto out; goto out;
inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_start = ext4_get_inline_xattr_pos(dir, &is.iloc);
inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
ret = ext4_search_dir(iloc.bh, inline_start, inline_size, ret = ext4_search_dir(is.iloc.bh, inline_start, inline_size,
dir, fname, 0, res_dir); dir, fname, 0, res_dir);
if (ret == 1) if (ret == 1)
goto out_find; goto out_find;
out: out:
brelse(iloc.bh); brelse(is.iloc.bh);
iloc.bh = NULL; if (ret < 0)
is.iloc.bh = ERR_PTR(ret);
else
is.iloc.bh = NULL;
out_find: out_find:
up_read(&EXT4_I(dir)->xattr_sem); up_read(&EXT4_I(dir)->xattr_sem);
return iloc.bh; return is.iloc.bh;
} }
int ext4_delete_inline_entry(handle_t *handle, int ext4_delete_inline_entry(handle_t *handle,

View File

@ -49,6 +49,11 @@
#include <trace/events/ext4.h> #include <trace/events/ext4.h>
static void ext4_journalled_zero_new_buffers(handle_t *handle,
struct inode *inode,
struct folio *folio,
unsigned from, unsigned to);
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei) struct ext4_inode_info *ei)
{ {
@ -478,7 +483,89 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ? status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len, ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status); map->m_pblk, status, 0);
return retval;
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct extent_status es;
unsigned int status;
int err, retval = 0;
/*
* We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE flag
* to indicate that the blocks and quotas have already been
* checked when the data was copied into the page cache.
*/
if (map->m_flags & EXT4_MAP_DELAYED)
flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
/*
* Here we clear m_flags because after allocating a new extent,
* it will be set again.
*/
map->m_flags &= ~EXT4_MAP_FLAGS;
/*
* We need to check for EXT4 here because migrate could have
* changed the inode type in between.
*/
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags);
} else {
retval = ext4_ind_map_blocks(handle, inode, map, flags);
/*
* We allocated new blocks which will result in i_data's
* format changing. Force the migrate to fail by clearing
* migrate flags.
*/
if (retval > 0 && map->m_flags & EXT4_MAP_NEW)
ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}
if (retval <= 0)
return retval;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
"ES len assertion failed for inode %lu: "
"retval %d != map->m_len %d",
inode->i_ino, retval, map->m_len);
WARN_ON(1);
}
/*
* We have to zeroout blocks before inserting them into extent
* status tree. Otherwise someone could look them up there and
* use them before they are really zeroed. We also have to
* unmap metadata before zeroing as otherwise writeback can
* overwrite zeros with stale data from block device.
*/
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) {
err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk,
map->m_len);
if (err)
return err;
}
/*
* If the extent has been zeroed out, we don't need to update
* extent status tree.
*/
if (flags & EXT4_GET_BLOCKS_PRE_IO &&
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es))
return retval;
}
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status, flags);
return retval; return retval;
} }
@ -576,32 +663,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block. * file system block.
*/ */
down_read(&EXT4_I(inode)->i_data_sem); down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_map_query_blocks(handle, inode, map);
retval = ext4_ext_map_blocks(handle, inode, map, 0);
} else {
retval = ext4_ind_map_blocks(handle, inode, map, 0);
}
if (retval > 0) {
unsigned int status;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
"ES len assertion failed for inode "
"%lu: retval %d != map->m_len %d",
inode->i_ino, retval, map->m_len);
WARN_ON(1);
}
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
}
up_read((&EXT4_I(inode)->i_data_sem)); up_read((&EXT4_I(inode)->i_data_sem));
found: found:
@ -630,12 +692,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
return retval; return retval;
/*
* Here we clear m_flags because after allocating a new extent,
* it will be set again.
*/
map->m_flags &= ~EXT4_MAP_FLAGS;
/* /*
* New blocks allocate and/or writing to unwritten extent * New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take * will possibly result in updating i_data, so we take
@ -643,76 +699,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* with create == 1 flag. * with create == 1 flag.
*/ */
down_write(&EXT4_I(inode)->i_data_sem); down_write(&EXT4_I(inode)->i_data_sem);
retval = ext4_map_create_blocks(handle, inode, map, flags);
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
*/
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags);
} else {
retval = ext4_ind_map_blocks(handle, inode, map, flags);
if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
/*
* We allocated new blocks which will result in
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}
}
if (retval > 0) {
unsigned int status;
if (unlikely(retval != map->m_len)) {
ext4_warning(inode->i_sb,
"ES len assertion failed for inode "
"%lu: retval %d != map->m_len %d",
inode->i_ino, retval, map->m_len);
WARN_ON(1);
}
/*
* We have to zeroout blocks before inserting them into extent
* status tree. Otherwise someone could look them up there and
* use them before they are really zeroed. We also have to
* unmap metadata before zeroing as otherwise writeback can
* overwrite zeros with stale data from block device.
*/
if (flags & EXT4_GET_BLOCKS_ZERO &&
map->m_flags & EXT4_MAP_MAPPED &&
map->m_flags & EXT4_MAP_NEW) {
ret = ext4_issue_zeroout(inode, map->m_lblk,
map->m_pblk, map->m_len);
if (ret) {
retval = ret;
goto out_sem;
}
}
/*
* If the extent has been zeroed out, we don't need to update
* extent status tree.
*/
if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es))
goto out_sem;
}
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
!(status & EXTENT_STATUS_WRITTEN) &&
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
}
out_sem:
up_write((&EXT4_I(inode)->i_data_sem)); up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
ret = check_block_validity(inode, map); ret = check_block_validity(inode, map);
@ -1018,31 +1005,15 @@ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
int do_journal_get_write_access(handle_t *handle, struct inode *inode, int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh) struct buffer_head *bh)
{ {
int dirty = buffer_dirty(bh);
int ret;
if (!buffer_mapped(bh) || buffer_freed(bh)) if (!buffer_mapped(bh) || buffer_freed(bh))
return 0; return 0;
/*
* __block_write_begin() could have dirtied some buffers. Clean
* the dirty bit as jbd2_journal_get_write_access() could complain
* otherwise about fs integrity issues. Setting of the dirty bit
* by __block_write_begin() isn't a real problem here as we clear
* the bit before releasing a page lock and thus writeback cannot
* ever write the buffer.
*/
if (dirty)
clear_buffer_dirty(bh);
BUFFER_TRACE(bh, "get write access"); BUFFER_TRACE(bh, "get write access");
ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, return ext4_journal_get_write_access(handle, inode->i_sb, bh,
EXT4_JTR_NONE); EXT4_JTR_NONE);
if (!ret && dirty)
ret = ext4_dirty_journalled_data(handle, bh);
return ret;
} }
#ifdef CONFIG_FS_ENCRYPTION int ext4_block_write_begin(handle_t *handle, struct folio *folio,
static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, loff_t pos, unsigned len,
get_block_t *get_block) get_block_t *get_block)
{ {
unsigned from = pos & (PAGE_SIZE - 1); unsigned from = pos & (PAGE_SIZE - 1);
@ -1056,6 +1027,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
struct buffer_head *bh, *head, *wait[2]; struct buffer_head *bh, *head, *wait[2];
int nr_wait = 0; int nr_wait = 0;
int i; int i;
bool should_journal_data = ext4_should_journal_data(inode);
BUG_ON(!folio_test_locked(folio)); BUG_ON(!folio_test_locked(folio));
BUG_ON(from > PAGE_SIZE); BUG_ON(from > PAGE_SIZE);
@ -1085,10 +1057,22 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
if (err) if (err)
break; break;
if (buffer_new(bh)) { if (buffer_new(bh)) {
/*
* We may be zeroing partial buffers or all new
* buffers in case of failure. Prepare JBD2 for
* that.
*/
if (should_journal_data)
do_journal_get_write_access(handle,
inode, bh);
if (folio_test_uptodate(folio)) { if (folio_test_uptodate(folio)) {
clear_buffer_new(bh); /*
* Unlike __block_write_begin() we leave
* dirtying of new uptodate buffers to
* ->write_end() time or
* folio_zero_new_buffers().
*/
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
continue; continue;
} }
if (block_end > to || block_start < from) if (block_end > to || block_start < from)
@ -1118,6 +1102,10 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
err = -EIO; err = -EIO;
} }
if (unlikely(err)) { if (unlikely(err)) {
if (should_journal_data)
ext4_journalled_zero_new_buffers(handle, inode, folio,
from, to);
else
folio_zero_new_buffers(folio, from, to); folio_zero_new_buffers(folio, from, to);
} else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
for (i = 0; i < nr_wait; i++) { for (i = 0; i < nr_wait; i++) {
@ -1134,7 +1122,6 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len,
return err; return err;
} }
#endif
/* /*
* To preserve ordering, it is essential that the hole instantiation and * To preserve ordering, it is essential that the hole instantiation and
@ -1216,19 +1203,12 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
/* In case writeback began while the folio was unlocked */ /* In case writeback began while the folio was unlocked */
folio_wait_stable(folio); folio_wait_stable(folio);
#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode)) if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(folio, pos, len, ret = ext4_block_write_begin(handle, folio, pos, len,
ext4_get_block_unwritten); ext4_get_block_unwritten);
else else
ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); ret = ext4_block_write_begin(handle, folio, pos, len,
#else ext4_get_block);
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(folio, pos, len,
ext4_get_block_unwritten);
else
ret = __block_write_begin(folio, pos, len, ext4_get_block);
#endif
if (!ret && ext4_should_journal_data(inode)) { if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode, ret = ext4_walk_page_buffers(handle, inode,
folio_buffers(folio), from, to, folio_buffers(folio), from, to,
@ -1241,7 +1221,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
folio_unlock(folio); folio_unlock(folio);
/* /*
* __block_write_begin may have instantiated a few blocks * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need * outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_rwsem. * i_size_read because we hold i_rwsem.
* *
@ -1388,9 +1368,9 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
size = min(to, block_end) - start; size = min(to, block_end) - start;
folio_zero_range(folio, start, size); folio_zero_range(folio, start, size);
write_end_fn(handle, inode, bh);
} }
clear_buffer_new(bh); clear_buffer_new(bh);
write_end_fn(handle, inode, bh);
} }
} }
block_start = block_end; block_start = block_end;
@ -1661,7 +1641,7 @@ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
int ret; int ret;
/* Has delalloc reservation? */ /* Has delalloc reservation? */
if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk))
return 1; return 1;
/* Already been allocated? */ /* Already been allocated? */
@ -1782,7 +1762,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
* Delayed extent could be allocated by fallocate. * Delayed extent could be allocated by fallocate.
* So we need to check it. * So we need to check it.
*/ */
if (ext4_es_is_delonly(&es)) { if (ext4_es_is_delayed(&es)) {
map->m_flags |= EXT4_MAP_DELAYED; map->m_flags |= EXT4_MAP_DELAYED;
return 0; return 0;
} }
@ -2217,11 +2197,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* writeback and there is nothing we can do about it so it might result * writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if * in data loss. So use reserved blocks to allocate metadata if
* possible. * possible.
*
* We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
* the blocks in question are delalloc blocks. This indicates
* that the blocks and quotas has already been checked when
* the data was copied into the page cache.
*/ */
get_blocks_flags = EXT4_GET_BLOCKS_CREATE | get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_METADATA_NOFAIL |
@ -2229,8 +2204,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
dioread_nolock = ext4_should_dioread_nolock(inode); dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock) if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
if (map->m_flags & BIT(BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
err = ext4_map_blocks(handle, inode, map, get_blocks_flags); err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0) if (err < 0)
@ -2959,11 +2932,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
if (IS_ERR(folio)) if (IS_ERR(folio))
return PTR_ERR(folio); return PTR_ERR(folio);
#ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(NULL, folio, pos, len,
ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); ext4_da_get_block_prep);
#else
ret = __block_write_begin(folio, pos, len, ext4_da_get_block_prep);
#endif
if (ret < 0) { if (ret < 0) {
folio_unlock(folio); folio_unlock(folio);
folio_put(folio); folio_put(folio);
@ -4067,7 +4037,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
stop_block); stop_block);
ext4_es_insert_extent(inode, first_block, hole_len, ~0, ext4_es_insert_extent(inode, first_block, hole_len, ~0,
EXTENT_STATUS_HOLE); EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem); up_write(&EXT4_I(inode)->i_data_sem);
} }
ext4_fc_track_range(handle, inode, first_block, stop_block); ext4_fc_track_range(handle, inode, first_block, stop_block);
@ -5276,8 +5246,9 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
{ {
unsigned offset; unsigned offset;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = 0; tid_t commit_tid;
int ret; int ret;
bool has_transaction;
offset = inode->i_size & (PAGE_SIZE - 1); offset = inode->i_size & (PAGE_SIZE - 1);
/* /*
@ -5302,12 +5273,14 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
folio_put(folio); folio_put(folio);
if (ret != -EBUSY) if (ret != -EBUSY)
return; return;
commit_tid = 0; has_transaction = false;
read_lock(&journal->j_state_lock); read_lock(&journal->j_state_lock);
if (journal->j_committing_transaction) if (journal->j_committing_transaction) {
commit_tid = journal->j_committing_transaction->t_tid; commit_tid = journal->j_committing_transaction->t_tid;
has_transaction = true;
}
read_unlock(&journal->j_state_lock); read_unlock(&journal->j_state_lock);
if (commit_tid) if (has_transaction)
jbd2_log_wait_commit(journal, commit_tid); jbd2_log_wait_commit(journal, commit_tid);
} }
} }
@ -6216,7 +6189,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
if (folio_pos(folio) + len > size) if (folio_pos(folio) + len > size)
len = size - folio_pos(folio); len = size - folio_pos(folio);
err = __block_write_begin(folio, 0, len, ext4_get_block); err = ext4_block_write_begin(handle, folio, 0, len,
ext4_get_block);
if (!err) { if (!err) {
ret = VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS;
if (ext4_journal_folio_buffers(handle, folio, len)) if (ext4_journal_folio_buffers(handle, folio, len))

View File

@ -2356,7 +2356,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ex.fe_logical = 0xDEADFA11; /* debug value */ ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len && if (max >= ac->ac_g_ex.fe_len &&
ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) { ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) {
ext4_fsblk_t start; ext4_fsblk_t start;
start = ext4_grp_offs_to_block(ac->ac_sb, &ex); start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
@ -2553,7 +2553,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe); do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block; i = (a * sbi->s_stripe) - first_group_block;
stripe = EXT4_B2C(sbi, sbi->s_stripe); stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe);
i = EXT4_B2C(sbi, i); i = EXT4_B2C(sbi, i);
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) { if (!mb_test_bit(i, bitmap)) {
@ -2928,9 +2928,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
if (cr == CR_POWER2_ALIGNED) if (cr == CR_POWER2_ALIGNED)
ext4_mb_simple_scan_group(ac, &e4b); ext4_mb_simple_scan_group(ac, &e4b);
else { else {
bool is_stripe_aligned = sbi->s_stripe && bool is_stripe_aligned =
(sbi->s_stripe >=
sbi->s_cluster_ratio) &&
!(ac->ac_g_ex.fe_len % !(ac->ac_g_ex.fe_len %
EXT4_B2C(sbi, sbi->s_stripe)); EXT4_NUM_B2C(sbi, sbi->s_stripe));
if ((cr == CR_GOAL_LEN_FAST || if ((cr == CR_GOAL_LEN_FAST ||
cr == CR_BEST_AVAIL_LEN) && cr == CR_BEST_AVAIL_LEN) &&
@ -3075,8 +3077,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
seq_puts(seq, " ]"); seq_puts(seq, " ]");
if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info)) if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
seq_puts(seq, " Block bitmap corrupted!"); seq_puts(seq, " Block bitmap corrupted!");
seq_puts(seq, "\n"); seq_putc(seq, '\n');
return 0; return 0;
} }
@ -3707,7 +3708,7 @@ int ext4_mb_init(struct super_block *sb)
*/ */
if (sbi->s_stripe > 1) { if (sbi->s_stripe > 1) {
sbi->s_mb_group_prealloc = roundup( sbi->s_mb_group_prealloc = roundup(
sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
} }
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@ -3887,10 +3888,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/* /*
* Clear the trimmed flag for the group so that the next * Clear the trimmed flag for the group so that the next
* ext4_trim_fs can trim it. * ext4_trim_fs can trim it.
* If the volume is mounted with -o discard, online discard
* is supported and the free blocks will be trimmed online.
*/ */
if (!test_opt(sb, DISCARD))
EXT4_MB_GRP_CLEAR_TRIMMED(db); EXT4_MB_GRP_CLEAR_TRIMMED(db);
if (!db->bb_free_root.rb_node) { if (!db->bb_free_root.rb_node) {
@ -6515,7 +6513,8 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
" group:%u block:%d count:%lu failed" " group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count, " with %d", block_group, bit, count,
err); err);
} else }
EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
ext4_lock_group(sb, block_group); ext4_lock_group(sb, block_group);

View File

@ -37,7 +37,6 @@ static int finish_range(handle_t *handle, struct inode *inode,
path = ext4_find_extent(inode, lb->first_block, NULL, 0); path = ext4_find_extent(inode, lb->first_block, NULL, 0);
if (IS_ERR(path)) { if (IS_ERR(path)) {
retval = PTR_ERR(path); retval = PTR_ERR(path);
path = NULL;
goto err_out; goto err_out;
} }
@ -53,7 +52,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0);
if (retval < 0) if (retval < 0)
goto err_out; goto err_out;
retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); path = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
if (IS_ERR(path))
retval = PTR_ERR(path);
err_out: err_out:
up_write((&EXT4_I(inode)->i_data_sem)); up_write((&EXT4_I(inode)->i_data_sem));
ext4_free_ext_path(path); ext4_free_ext_path(path);
@ -663,8 +664,8 @@ int ext4_ind_migrate(struct inode *inode)
if (unlikely(ret2 && !ret)) if (unlikely(ret2 && !ret))
ret = ret2; ret = ret2;
errout: errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem); up_write(&EXT4_I(inode)->i_data_sem);
ext4_journal_stop(handle);
out_unlock: out_unlock:
ext4_writepages_up_write(inode->i_sb, alloc_ctx); ext4_writepages_up_write(inode->i_sb, alloc_ctx);
return ret; return ret;

View File

@ -17,27 +17,23 @@
* get_ext_path() - Find an extent path for designated logical block number. * get_ext_path() - Find an extent path for designated logical block number.
* @inode: inode to be searched * @inode: inode to be searched
* @lblock: logical block number to find an extent path * @lblock: logical block number to find an extent path
* @ppath: pointer to an extent path pointer (for output) * @path: pointer to an extent path
* *
* ext4_find_extent wrapper. Return 0 on success, or a negative error value * ext4_find_extent wrapper. Return an extent path pointer on success,
* on failure. * or an error pointer on failure.
*/ */
static inline int static inline struct ext4_ext_path *
get_ext_path(struct inode *inode, ext4_lblk_t lblock, get_ext_path(struct inode *inode, ext4_lblk_t lblock,
struct ext4_ext_path **ppath) struct ext4_ext_path *path)
{ {
struct ext4_ext_path *path; path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE);
path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
if (IS_ERR(path)) if (IS_ERR(path))
return PTR_ERR(path); return path;
if (path[ext_depth(inode)].p_ext == NULL) { if (path[ext_depth(inode)].p_ext == NULL) {
ext4_free_ext_path(path); ext4_free_ext_path(path);
*ppath = NULL; return ERR_PTR(-ENODATA);
return -ENODATA;
} }
*ppath = path; return path;
return 0;
} }
/** /**
@ -95,9 +91,11 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
int ret = 0; int ret = 0;
ext4_lblk_t last = from + count; ext4_lblk_t last = from + count;
while (from < last) { while (from < last) {
*err = get_ext_path(inode, from, &path); path = get_ext_path(inode, from, path);
if (*err) if (IS_ERR(path)) {
goto out; *err = PTR_ERR(path);
return ret;
}
ext = path[ext_depth(inode)].p_ext; ext = path[ext_depth(inode)].p_ext;
if (unwritten != ext4_ext_is_unwritten(ext)) if (unwritten != ext4_ext_is_unwritten(ext))
goto out; goto out;
@ -166,15 +164,16 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2,
return 0; return 0;
} }
/* Force page buffers uptodate w/o dropping page's lock */ /* Force folio buffers uptodate w/o dropping folio's lock */
static int static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
{ {
struct inode *inode = folio->mapping->host; struct inode *inode = folio->mapping->host;
sector_t block; sector_t block;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; struct buffer_head *bh, *head;
unsigned int blocksize, block_start, block_end; unsigned int blocksize, block_start, block_end;
int i, err, nr = 0, partial = 0; int nr = 0;
bool partial = false;
BUG_ON(!folio_test_locked(folio)); BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio)); BUG_ON(folio_test_writeback(folio));
@ -186,19 +185,21 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
if (!head) if (!head)
head = create_empty_buffers(folio, blocksize, 0); head = create_empty_buffers(folio, blocksize, 0);
block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); block = folio_pos(folio) >> inode->i_blkbits;
for (bh = head, block_start = 0; bh != head || !block_start; block_end = 0;
block++, block_start = block_end, bh = bh->b_this_page) { bh = head;
do {
block_start = block_end;
block_end = block_start + blocksize; block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) { if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh)) if (!buffer_uptodate(bh))
partial = 1; partial = true;
continue; continue;
} }
if (buffer_uptodate(bh)) if (buffer_uptodate(bh))
continue; continue;
if (!buffer_mapped(bh)) { if (!buffer_mapped(bh)) {
err = ext4_get_block(inode, block, bh, 0); int err = ext4_get_block(inode, block, bh, 0);
if (err) if (err)
return err; return err;
if (!buffer_mapped(bh)) { if (!buffer_mapped(bh)) {
@ -207,21 +208,30 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
continue; continue;
} }
} }
BUG_ON(nr >= MAX_BUF_PER_PAGE); lock_buffer(bh);
arr[nr++] = bh; if (buffer_uptodate(bh)) {
unlock_buffer(bh);
continue;
} }
ext4_read_bh_nowait(bh, 0, NULL);
nr++;
} while (block++, (bh = bh->b_this_page) != head);
/* No io required */ /* No io required */
if (!nr) if (!nr)
goto out; goto out;
for (i = 0; i < nr; i++) { bh = head;
bh = arr[i]; do {
if (!bh_uptodate_or_lock(bh)) { if (bh_offset(bh) + blocksize <= from)
err = ext4_read_bh(bh, 0, NULL); continue;
if (err) if (bh_offset(bh) > to)
return err; break;
} wait_on_buffer(bh);
} if (buffer_uptodate(bh))
continue;
return -EIO;
} while ((bh = bh->b_this_page) != head);
out: out:
if (!partial) if (!partial)
folio_mark_uptodate(folio); folio_mark_uptodate(folio);
@ -624,9 +634,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
int offset_in_page; int offset_in_page;
int unwritten, cur_len; int unwritten, cur_len;
ret = get_ext_path(orig_inode, o_start, &path); path = get_ext_path(orig_inode, o_start, path);
if (ret) if (IS_ERR(path)) {
ret = PTR_ERR(path);
goto out; goto out;
}
ex = path[path->p_depth].p_ext; ex = path[path->p_depth].p_ext;
cur_blk = le32_to_cpu(ex->ee_block); cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex); cur_len = ext4_ext_get_actual_len(ex);

View File

@ -1482,7 +1482,7 @@ static bool ext4_match(struct inode *parent,
} }
/* /*
* Returns 0 if not found, -1 on failure, and 1 on success * Returns 0 if not found, -EFSCORRUPTED on failure, and 1 on success
*/ */
int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
struct inode *dir, struct ext4_filename *fname, struct inode *dir, struct ext4_filename *fname,
@ -1503,7 +1503,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
* a full check */ * a full check */
if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf, if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
buf_size, offset)) buf_size, offset))
return -1; return -EFSCORRUPTED;
*res_dir = de; *res_dir = de;
return 1; return 1;
} }
@ -1511,7 +1511,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
de_len = ext4_rec_len_from_disk(de->rec_len, de_len = ext4_rec_len_from_disk(de->rec_len,
dir->i_sb->s_blocksize); dir->i_sb->s_blocksize);
if (de_len <= 0) if (de_len <= 0)
return -1; return -EFSCORRUPTED;
offset += de_len; offset += de_len;
de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
} }
@ -1574,7 +1574,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
&has_inline_data); &has_inline_data);
if (inlined) if (inlined)
*inlined = has_inline_data; *inlined = has_inline_data;
if (has_inline_data) if (has_inline_data || IS_ERR(ret))
goto cleanup_and_exit; goto cleanup_and_exit;
} }
@ -1663,9 +1663,11 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
goto cleanup_and_exit; goto cleanup_and_exit;
} else { } else {
brelse(bh); brelse(bh);
if (i < 0) if (i < 0) {
ret = ERR_PTR(i);
goto cleanup_and_exit; goto cleanup_and_exit;
} }
}
next: next:
if (++block >= nblocks) if (++block >= nblocks)
block = 0; block = 0;
@ -1758,7 +1760,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
if (retval == 1) if (retval == 1)
goto success; goto success;
brelse(bh); brelse(bh);
if (retval == -1) { if (retval < 0) {
bh = ERR_PTR(ERR_BAD_DX_DIR); bh = ERR_PTR(ERR_BAD_DX_DIR);
goto errout; goto errout;
} }
@ -1999,7 +2001,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
split = count/2; split = count/2;
hash2 = map[split].hash; hash2 = map[split].hash;
continued = hash2 == map[split - 1].hash; continued = split > 0 ? hash2 == map[split - 1].hash : 0;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
(unsigned long)dx_get_block(frame->at), (unsigned long)dx_get_block(frame->at),
hash2, split, count-split)); hash2, split, count-split));

View File

@ -221,7 +221,7 @@ int ext4_mpage_readpages(struct inode *inode,
sector_t block_in_file; sector_t block_in_file;
sector_t last_block; sector_t last_block;
sector_t last_block_in_file; sector_t last_block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE]; sector_t first_block;
unsigned page_block; unsigned page_block;
struct block_device *bdev = inode->i_sb->s_bdev; struct block_device *bdev = inode->i_sb->s_bdev;
int length; int length;
@ -263,6 +263,7 @@ int ext4_mpage_readpages(struct inode *inode,
unsigned map_offset = block_in_file - map.m_lblk; unsigned map_offset = block_in_file - map.m_lblk;
unsigned last = map.m_len - map_offset; unsigned last = map.m_len - map_offset;
first_block = map.m_pblk + map_offset;
for (relative_block = 0; ; relative_block++) { for (relative_block = 0; ; relative_block++) {
if (relative_block == last) { if (relative_block == last) {
/* needed? */ /* needed? */
@ -271,8 +272,6 @@ int ext4_mpage_readpages(struct inode *inode,
} }
if (page_block == blocks_per_page) if (page_block == blocks_per_page)
break; break;
blocks[page_block] = map.m_pblk + map_offset +
relative_block;
page_block++; page_block++;
block_in_file++; block_in_file++;
} }
@ -307,7 +306,9 @@ int ext4_mpage_readpages(struct inode *inode,
goto confused; /* hole -> non-hole */ goto confused; /* hole -> non-hole */
/* Contiguous blocks? */ /* Contiguous blocks? */
if (page_block && blocks[page_block-1] != map.m_pblk-1) if (!page_block)
first_block = map.m_pblk;
else if (first_block + page_block != map.m_pblk)
goto confused; goto confused;
for (relative_block = 0; ; relative_block++) { for (relative_block = 0; ; relative_block++) {
if (relative_block == map.m_len) { if (relative_block == map.m_len) {
@ -316,7 +317,6 @@ int ext4_mpage_readpages(struct inode *inode,
break; break;
} else if (page_block == blocks_per_page) } else if (page_block == blocks_per_page)
break; break;
blocks[page_block] = map.m_pblk+relative_block;
page_block++; page_block++;
block_in_file++; block_in_file++;
} }
@ -339,7 +339,7 @@ int ext4_mpage_readpages(struct inode *inode,
* This folio will go to BIO. Do we need to send this * This folio will go to BIO. Do we need to send this
* BIO off first? * BIO off first?
*/ */
if (bio && (last_block_in_bio != blocks[0] - 1 || if (bio && (last_block_in_bio != first_block - 1 ||
!fscrypt_mergeable_bio(bio, inode, next_block))) { !fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc: submit_and_realloc:
submit_bio(bio); submit_bio(bio);
@ -355,7 +355,7 @@ int ext4_mpage_readpages(struct inode *inode,
fscrypt_set_bio_crypt_ctx(bio, inode, next_block, fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL); GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, folio->index); ext4_set_bio_post_read_ctx(bio, inode, folio->index);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_iter.bi_sector = first_block << (blkbits - 9);
bio->bi_end_io = mpage_end_io; bio->bi_end_io = mpage_end_io;
if (rac) if (rac)
bio->bi_opf |= REQ_RAHEAD; bio->bi_opf |= REQ_RAHEAD;
@ -371,7 +371,7 @@ int ext4_mpage_readpages(struct inode *inode,
submit_bio(bio); submit_bio(bio);
bio = NULL; bio = NULL;
} else } else
last_block_in_bio = blocks[blocks_per_page - 1]; last_block_in_bio = first_block + blocks_per_page - 1;
continue; continue;
confused: confused:
if (bio) { if (bio) {

View File

@ -1319,8 +1319,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
bh = ext4_get_bitmap(sb, group_data->inode_bitmap); bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
if (!bh) if (!bh)
return -EIO; return -EIO;
ext4_inode_bitmap_csum_set(sb, gdp, bh, ext4_inode_bitmap_csum_set(sb, gdp, bh);
EXT4_INODES_PER_GROUP(sb) / 8);
brelse(bh); brelse(bh);
bh = ext4_get_bitmap(sb, group_data->block_bitmap); bh = ext4_get_bitmap(sb, group_data->block_bitmap);

View File

@ -735,11 +735,12 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/* /*
* Make sure updated value of ->s_mount_flags will be visible before * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem
* ->s_flags update * modifications. We don't set SB_RDONLY because that requires
* sb->s_umount semaphore and setting it without proper remount
* procedure is confusing code such as freeze_super() leading to
* deadlocks and other problems.
*/ */
smp_wmb();
sb->s_flags |= SB_RDONLY;
} }
static void update_super_work(struct work_struct *work) static void update_super_work(struct work_struct *work)
@ -3045,7 +3046,7 @@ int ext4_seq_options_show(struct seq_file *seq, void *offset)
seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
rc = _ext4_show_options(seq, sb, 1); rc = _ext4_show_options(seq, sb, 1);
seq_puts(seq, "\n"); seq_putc(seq, '\n');
return rc; return rc;
} }
@ -5087,16 +5088,27 @@ static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
return ret; return ret;
} }
static void ext4_hash_info_init(struct super_block *sb) static int ext4_hash_info_init(struct super_block *sb)
{ {
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es; struct ext4_super_block *es = sbi->s_es;
unsigned int i; unsigned int i;
sbi->s_def_hash_version = es->s_def_hash_version;
if (sbi->s_def_hash_version > DX_HASH_LAST) {
ext4_msg(sb, KERN_ERR,
"Invalid default hash set in the superblock");
return -EINVAL;
} else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) {
ext4_msg(sb, KERN_ERR,
"SIPHASH is not a valid default hash value");
return -EINVAL;
}
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
if (ext4_has_feature_dir_index(sb)) { if (ext4_has_feature_dir_index(sb)) {
i = le32_to_cpu(es->s_flags); i = le32_to_cpu(es->s_flags);
if (i & EXT2_FLAGS_UNSIGNED_HASH) if (i & EXT2_FLAGS_UNSIGNED_HASH)
@ -5114,6 +5126,7 @@ static void ext4_hash_info_init(struct super_block *sb)
#endif #endif
} }
} }
return 0;
} }
static int ext4_block_group_meta_init(struct super_block *sb, int silent) static int ext4_block_group_meta_init(struct super_block *sb, int silent)
@ -5165,6 +5178,18 @@ static int ext4_block_group_meta_init(struct super_block *sb, int silent)
return 0; return 0;
} }
/*
* It's hard to get stripe aligned blocks if stripe is not aligned with
* cluster, just disable stripe and alert user to simplify code and avoid
* stripe aligned allocation which will rarely succeed.
*/
static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
return (stripe > 0 && sbi->s_cluster_ratio > 1 &&
stripe % sbi->s_cluster_ratio != 0);
}
static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
{ {
struct ext4_super_block *es = NULL; struct ext4_super_block *es = NULL;
@ -5249,7 +5274,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err) if (err)
goto failed_mount; goto failed_mount;
ext4_hash_info_init(sb); err = ext4_hash_info_init(sb);
if (err)
goto failed_mount;
err = ext4_handle_clustersize(sb); err = ext4_handle_clustersize(sb);
if (err) if (err)
@ -5272,13 +5299,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount3; goto failed_mount3;
sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_stripe = ext4_get_stripe_size(sbi);
/* if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) {
* It's hard to get stripe aligned blocks if stripe is not aligned with
* cluster, just disable stripe and alert user to simpfy code and avoid
* stripe aligned allocation which will rarely successes.
*/
if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 &&
sbi->s_stripe % sbi->s_cluster_ratio != 0) {
ext4_msg(sb, KERN_WARNING, ext4_msg(sb, KERN_WARNING,
"stripe (%lu) is not aligned with cluster size (%u), " "stripe (%lu) is not aligned with cluster size (%u), "
"stripe is disabled", "stripe is disabled",
@ -5313,6 +5334,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_orphan_lock);
spin_lock_init(&sbi->s_bdev_wb_lock);
ext4_fast_commit_init(sb); ext4_fast_commit_init(sb);
sb->s_root = NULL; sb->s_root = NULL;
@ -5534,7 +5557,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
* Save the original bdev mapping's wb_err value which could be * Save the original bdev mapping's wb_err value which could be
* used to detect the metadata async write error. * used to detect the metadata async write error.
*/ */
spin_lock_init(&sbi->s_bdev_wb_lock);
errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
&sbi->s_bdev_wb_err); &sbi->s_bdev_wb_err);
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
@ -5614,8 +5636,8 @@ failed_mount8: __maybe_unused
failed_mount3: failed_mount3:
/* flush s_sb_upd_work before sbi destroy */ /* flush s_sb_upd_work before sbi destroy */
flush_work(&sbi->s_sb_upd_work); flush_work(&sbi->s_sb_upd_work);
del_timer_sync(&sbi->s_err_report);
ext4_stop_mmpd(sbi); ext4_stop_mmpd(sbi);
del_timer_sync(&sbi->s_err_report);
ext4_group_desc_free(sbi); ext4_group_desc_free(sbi);
failed_mount: failed_mount:
if (sbi->s_chksum_driver) if (sbi->s_chksum_driver)
@ -6441,6 +6463,15 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
} }
if ((ctx->spec & EXT4_SPEC_s_stripe) &&
ext4_is_stripe_incompatible(sb, ctx->s_stripe)) {
ext4_msg(sb, KERN_WARNING,
"stripe (%lu) is not aligned with cluster size (%u), "
"stripe is disabled",
ctx->s_stripe, sbi->s_cluster_ratio);
ctx->s_stripe = 0;
}
/* /*
* Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
* two calls to ext4_should_dioread_nolock() to return inconsistent * two calls to ext4_should_dioread_nolock() to return inconsistent

View File

@ -458,7 +458,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE);
ext4_xattr_inode_set_ref(inode, 1); ext4_xattr_inode_set_ref(inode, 1);
} else { } else {
inode_lock(inode); inode_lock_nested(inode, I_MUTEX_XATTR);
inode->i_flags |= S_NOQUOTA; inode->i_flags |= S_NOQUOTA;
inode_unlock(inode); inode_unlock(inode);
} }
@ -1039,7 +1039,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
s64 ref_count; s64 ref_count;
int ret; int ret;
inode_lock(ea_inode); inode_lock_nested(ea_inode, I_MUTEX_XATTR);
ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc);
if (ret) if (ret)
@ -2879,11 +2879,9 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
if (*ea_inode_array == NULL) { if (*ea_inode_array == NULL) {
/* /*
* Start with 15 inodes, so it fits into a power-of-two size. * Start with 15 inodes, so it fits into a power-of-two size.
* If *ea_inode_array is NULL, this is essentially offsetof()
*/ */
(*ea_inode_array) = (*ea_inode_array) = kmalloc(
kmalloc(offsetof(struct ext4_xattr_inode_array, struct_size(*ea_inode_array, inodes, EIA_MASK),
inodes[EIA_MASK]),
GFP_NOFS); GFP_NOFS);
if (*ea_inode_array == NULL) if (*ea_inode_array == NULL)
return -ENOMEM; return -ENOMEM;
@ -2891,21 +2889,21 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
/* expand the array once all 15 + n * 16 slots are full */ /* expand the array once all 15 + n * 16 slots are full */
struct ext4_xattr_inode_array *new_array = NULL; struct ext4_xattr_inode_array *new_array = NULL;
int count = (*ea_inode_array)->count;
/* if new_array is NULL, this is essentially offsetof() */
new_array = kmalloc( new_array = kmalloc(
offsetof(struct ext4_xattr_inode_array, struct_size(*ea_inode_array, inodes,
inodes[count + EIA_INCR]), (*ea_inode_array)->count + EIA_INCR),
GFP_NOFS); GFP_NOFS);
if (new_array == NULL) if (new_array == NULL)
return -ENOMEM; return -ENOMEM;
memcpy(new_array, *ea_inode_array, memcpy(new_array, *ea_inode_array,
offsetof(struct ext4_xattr_inode_array, inodes[count])); struct_size(*ea_inode_array, inodes,
(*ea_inode_array)->count));
kfree(*ea_inode_array); kfree(*ea_inode_array);
*ea_inode_array = new_array; *ea_inode_array = new_array;
} }
(*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; (*ea_inode_array)->count++;
(*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
return 0; return 0;
} }
@ -3036,8 +3034,6 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
* *
* Create a new entry in the extended attribute block cache, and insert * Create a new entry in the extended attribute block cache, and insert
* it unless such an entry is already in the cache. * it unless such an entry is already in the cache.
*
* Returns 0, or a negative error number on failure.
*/ */
static void static void
ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
@ -3065,8 +3061,7 @@ ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache,
* *
* Compare two extended attribute blocks for equality. * Compare two extended attribute blocks for equality.
* *
* Returns 0 if the blocks are equal, 1 if they differ, and * Returns 0 if the blocks are equal, 1 if they differ.
* a negative error number on errors.
*/ */
static int static int
ext4_xattr_cmp(struct ext4_xattr_header *header1, ext4_xattr_cmp(struct ext4_xattr_header *header1,

View File

@ -32,8 +32,7 @@ struct ext4_xattr_header {
__le32 h_refcount; /* reference count */ __le32 h_refcount; /* reference count */
__le32 h_blocks; /* number of disk blocks used */ __le32 h_blocks; /* number of disk blocks used */
__le32 h_hash; /* hash value of all attributes */ __le32 h_hash; /* hash value of all attributes */
__le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ __le32 h_checksum; /* crc32c(uuid+blknum+xattrblock) */
/* id = inum if refcount=1, blknum otherwise */
__u32 h_reserved[3]; /* zero right now */ __u32 h_reserved[3]; /* zero right now */
}; };
@ -130,8 +129,8 @@ struct ext4_xattr_ibody_find {
}; };
struct ext4_xattr_inode_array { struct ext4_xattr_inode_array {
unsigned int count; /* # of used items in the array */ unsigned int count;
struct inode *inodes[]; struct inode *inodes[] __counted_by(count);
}; };
extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_user_handler;

View File

@ -79,17 +79,23 @@ __releases(&journal->j_state_lock)
if (space_left < nblocks) { if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL; int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0; tid_t tid = 0;
bool has_transaction = false;
if (journal->j_committing_transaction) if (journal->j_committing_transaction) {
tid = journal->j_committing_transaction->t_tid; tid = journal->j_committing_transaction->t_tid;
has_transaction = true;
}
spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock); write_unlock(&journal->j_state_lock);
if (chkpt) { if (chkpt) {
jbd2_log_do_checkpoint(journal); jbd2_log_do_checkpoint(journal);
} else if (jbd2_cleanup_journal_tail(journal) == 0) { } else if (jbd2_cleanup_journal_tail(journal) <= 0) {
/* We were able to recover space; yay! */ /*
* We were able to recover space or the
* journal was aborted due to an error.
*/
; ;
} else if (tid) { } else if (has_transaction) {
/* /*
* jbd2_journal_commit_transaction() may want * jbd2_journal_commit_transaction() may want
* to take the checkpoint_mutex if JBD2_FLUSHED * to take the checkpoint_mutex if JBD2_FLUSHED
@ -407,6 +413,7 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
tid_t tid = 0; tid_t tid = 0;
unsigned long nr_freed = 0; unsigned long nr_freed = 0;
unsigned long freed; unsigned long freed;
bool first_set = false;
again: again:
spin_lock(&journal->j_list_lock); spin_lock(&journal->j_list_lock);
@ -426,8 +433,10 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
else else
transaction = journal->j_checkpoint_transactions; transaction = journal->j_checkpoint_transactions;
if (!first_tid) if (!first_set) {
first_tid = transaction->t_tid; first_tid = transaction->t_tid;
first_set = true;
}
last_transaction = journal->j_checkpoint_transactions->t_cpprev; last_transaction = journal->j_checkpoint_transactions->t_cpprev;
next_transaction = transaction; next_transaction = transaction;
last_tid = last_transaction->t_tid; last_tid = last_transaction->t_tid;
@ -457,7 +466,7 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_list_lock);
cond_resched(); cond_resched();
if (*nr_to_scan && next_tid) if (*nr_to_scan && journal->j_shrink_transaction)
goto again; goto again;
out: out:
trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,

View File

@ -281,6 +281,16 @@ static void journal_kill_thread(journal_t *journal)
write_unlock(&journal->j_state_lock); write_unlock(&journal->j_state_lock);
} }
static inline bool jbd2_data_needs_escaping(char *data)
{
return *((__be32 *)data) == cpu_to_be32(JBD2_MAGIC_NUMBER);
}
/*
 * Escape the block starting at @data by overwriting its first word
 * (one unsigned int) with zero. The remaining bytes are left untouched.
 * Callers in this file invoke it once jbd2_data_needs_escaping() has
 * reported a match.
 */
static inline void jbd2_data_do_escape(char *data)
{
	unsigned int *first_word = (unsigned int *)data;

	*first_word = 0;
}
/* /*
* jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal. * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
* *
@ -318,9 +328,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct buffer_head **bh_out, struct buffer_head **bh_out,
sector_t blocknr) sector_t blocknr)
{ {
int done_copy_out = 0;
int do_escape = 0; int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh; struct buffer_head *new_bh;
struct folio *new_folio; struct folio *new_folio;
unsigned int new_offset; unsigned int new_offset;
@ -349,37 +357,33 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
* we use that version of the data for the commit. * we use that version of the data for the commit.
*/ */
if (jh_in->b_frozen_data) { if (jh_in->b_frozen_data) {
done_copy_out = 1;
new_folio = virt_to_folio(jh_in->b_frozen_data); new_folio = virt_to_folio(jh_in->b_frozen_data);
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
do_escape = jbd2_data_needs_escaping(jh_in->b_frozen_data);
if (do_escape)
jbd2_data_do_escape(jh_in->b_frozen_data);
} else { } else {
char *tmp;
char *mapped_data;
new_folio = bh_in->b_folio; new_folio = bh_in->b_folio;
new_offset = offset_in_folio(new_folio, bh_in->b_data); new_offset = offset_in_folio(new_folio, bh_in->b_data);
}
mapped_data = kmap_local_folio(new_folio, new_offset); mapped_data = kmap_local_folio(new_folio, new_offset);
/* /*
* Fire data frozen trigger if data already wasn't frozen. Do this * Fire data frozen trigger if data already wasn't frozen. Do
* before checking for escaping, as the trigger may modify the magic * this before checking for escaping, as the trigger may modify
* offset. If a copy-out happens afterwards, it will have the correct * the magic offset. If a copy-out happens afterwards, it will
* data in the buffer. * have the correct data in the buffer.
*/ */
if (!done_copy_out)
jbd2_buffer_frozen_trigger(jh_in, mapped_data, jbd2_buffer_frozen_trigger(jh_in, mapped_data,
jh_in->b_triggers); jh_in->b_triggers);
do_escape = jbd2_data_needs_escaping(mapped_data);
/*
* Check for escaping
*/
if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER))
do_escape = 1;
kunmap_local(mapped_data); kunmap_local(mapped_data);
/* /*
* Do we need to do a data copy? * Do we need to do a data copy?
*/ */
if (do_escape && !done_copy_out) { if (!do_escape)
char *tmp; goto escape_done;
spin_unlock(&jh_in->b_state_lock); spin_unlock(&jh_in->b_state_lock);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
@ -406,18 +410,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
copy_done: copy_done:
new_folio = virt_to_folio(jh_in->b_frozen_data); new_folio = virt_to_folio(jh_in->b_frozen_data);
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
done_copy_out = 1; jbd2_data_do_escape(jh_in->b_frozen_data);
} }
/* escape_done:
* Did we need to do an escaping? Now we've done all the
* copying, we can finally do so.
* b_frozen_data is from jbd2_alloc() which always provides an
* address from the direct kernels mapping.
*/
if (do_escape)
*((unsigned int *)jh_in->b_frozen_data) = 0;
folio_set_bh(new_bh, new_folio, new_offset); folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size; new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev; new_bh->b_bdev = journal->j_dev;
@ -710,7 +706,7 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
return -EINVAL; return -EINVAL;
write_lock(&journal->j_state_lock); write_lock(&journal->j_state_lock);
if (tid <= journal->j_commit_sequence) { if (tid_geq(journal->j_commit_sequence, tid)) {
write_unlock(&journal->j_state_lock); write_unlock(&journal->j_state_lock);
return -EALREADY; return -EALREADY;
} }
@ -740,9 +736,9 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit);
*/ */
static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{ {
jbd2_journal_unlock_updates(journal);
if (journal->j_fc_cleanup_callback) if (journal->j_fc_cleanup_callback)
journal->j_fc_cleanup_callback(journal, 0, tid); journal->j_fc_cleanup_callback(journal, 0, tid);
jbd2_journal_unlock_updates(journal);
write_lock(&journal->j_state_lock); write_lock(&journal->j_state_lock);
journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
if (fallback) if (fallback)
@ -841,17 +837,12 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
*bh_out = NULL; *bh_out = NULL;
if (journal->j_fc_off + journal->j_fc_first < journal->j_fc_last) { if (journal->j_fc_off + journal->j_fc_first >= journal->j_fc_last)
return -EINVAL;
fc_off = journal->j_fc_off; fc_off = journal->j_fc_off;
blocknr = journal->j_fc_first + fc_off; blocknr = journal->j_fc_first + fc_off;
journal->j_fc_off++; journal->j_fc_off++;
} else {
ret = -EINVAL;
}
if (ret)
return ret;
ret = jbd2_journal_bmap(journal, blocknr, &pblock); ret = jbd2_journal_bmap(journal, blocknr, &pblock);
if (ret) if (ret)
return ret; return ret;
@ -860,7 +851,6 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out)
if (!bh) if (!bh)
return -ENOMEM; return -ENOMEM;
journal->j_fc_wbuf[fc_off] = bh; journal->j_fc_wbuf[fc_off] = bh;
*bh_out = bh; *bh_out = bh;
@ -903,7 +893,7 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks)
} }
EXPORT_SYMBOL(jbd2_fc_wait_bufs); EXPORT_SYMBOL(jbd2_fc_wait_bufs);
int jbd2_fc_release_bufs(journal_t *journal) void jbd2_fc_release_bufs(journal_t *journal)
{ {
struct buffer_head *bh; struct buffer_head *bh;
int i, j_fc_off; int i, j_fc_off;
@ -917,8 +907,6 @@ int jbd2_fc_release_bufs(journal_t *journal)
put_bh(bh); put_bh(bh);
journal->j_fc_wbuf[i] = NULL; journal->j_fc_wbuf[i] = NULL;
} }
return 0;
} }
EXPORT_SYMBOL(jbd2_fc_release_bufs); EXPORT_SYMBOL(jbd2_fc_release_bufs);
@ -1944,7 +1932,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags)
if (had_fast_commit) if (had_fast_commit)
jbd2_set_feature_fast_commit(journal); jbd2_set_feature_fast_commit(journal);
/* Log is no longer empty */ /* Log is empty */
write_lock(&journal->j_state_lock); write_lock(&journal->j_state_lock);
journal->j_flags |= JBD2_FLUSHED; journal->j_flags |= JBD2_FLUSHED;
write_unlock(&journal->j_state_lock); write_unlock(&journal->j_state_lock);
@ -2866,7 +2854,6 @@ static struct journal_head *journal_alloc_journal_head(void)
ret = kmem_cache_zalloc(jbd2_journal_head_cache, ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL); GFP_NOFS | __GFP_NOFAIL);
} }
if (ret)
spin_lock_init(&ret->b_state_lock); spin_lock_init(&ret->b_state_lock);
return ret; return ret;
} }

View File

@ -1086,7 +1086,7 @@ struct journal_s
int j_revoke_records_per_block; int j_revoke_records_per_block;
/** /**
* @j_transaction_overhead: * @j_transaction_overhead_buffers:
* *
* Number of blocks each transaction needs for its own bookkeeping * Number of blocks each transaction needs for its own bookkeeping
*/ */
@ -1675,7 +1675,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out);
int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
int jbd2_fc_release_bufs(journal_t *journal); void jbd2_fc_release_bufs(journal_t *journal);
/* /*
* is_journal_abort * is_journal_abort