mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-01 10:42:11 +00:00
0d227466be
When we log modifications based on intents, we add both intent and intent done items to the modification being made. These get written to the log to ensure that the operation is re-run if the intent done is not found in the log. However, for operations that complete wholly within a single checkpoint, the change in the checkpoint is atomic and will never need replay. In this case, we don't need to actually write the intent and intent done items to the journal because log recovery will never need to manually restart this modification. Log recovery currently handles intent/intent done matching by inserting the intent into the AIL, then removing it when a matching intent done item is found. Hence for all the intent-based operations that complete within a checkpoint, we spend all that time parsing the intent/intent done items just to cancel them and do nothing with them. Hence it follows that the only time we actually need intents in the log is when the modification crosses checkpoint boundaries in the log and so may only be partially complete in the journal. Hence if we commit an intent done item to the CIL and the intent item is in the same checkpoint, we don't actually have to write them to the journal because log recovery will always cancel the intents. We've never really worried about the overhead of logging intents unnecessarily like this because the intents we log are generally very much smaller than the change being made. e.g. freeing an extent involves modifying at least two freespace btree blocks and the AGF, so the EFI/EFD overhead is only a small increase in space and processing time compared to the overall cost of freeing an extent. However, delayed attributes change this cost equation dramatically, especially for inline attributes. In the case of adding an inline attribute, we only log the inode core and attribute fork at present. 
With delayed attributes, we now log the attr intent which includes the name and value, the inode core and attr fork, and finally the attr intent done item. We increase the number of items we log from 1 to 3, and the number of log vectors (regions) goes up from 3 to 7. Hence we triple the number of objects that the CIL has to process, and more than double the number of log vectors that need to be written to the journal. At scale, this means delayed attributes cause a non-pipelined CIL to become CPU bound processing all the extra items, resulting in a > 40% performance degradation on 16-way file+xattr create workloads. Pipelining the CIL (as per 5.15) reduces the performance degradation to 20%, but now the limitation is the rate at which the log items can be written to the iclogs and iclogs be dispatched for IO and completed. Even log IO completion is slowed down by these intents, because it now has to process 3x the number of items in the checkpoint. Processing completed intents is especially inefficient here, because we first insert the intent into the AIL, then remove it from the AIL when the intent done is processed. IOWs, we are also doing expensive operations in log IO completion we could completely avoid if we didn't log completed intent/intent done pairs. Enter log item whiteouts. When an intent done is committed, we can check to see if the associated intent is in the same checkpoint as we are currently committing the intent done to. If so, we can mark the intent log item with a whiteout and immediately free the intent done item rather than committing it to the CIL. We can basically skip the entire formatting and CIL insertion steps for the intent done item. However, we cannot remove the intent item from the CIL at this point because the unlocked per-cpu CIL item lists do not permit removal without holding the CIL context lock exclusively. 
Transaction commit only holds the context lock shared, hence the best we can do is mark the intent item with a whiteout so that the CIL push can release it rather than writing it to the log. This means we never write the intent to the log if the intent done has also been committed to the same checkpoint, but we'll always write the intent if the intent done has not been committed or has been committed to a different checkpoint. This will result in correct log recovery behaviour in all cases, without the overhead of logging unnecessary intents. This intent whiteout concept is generic - we can apply it to all intent/intent done pairs that have a direct 1:1 relationship. The way deferred ops iterate and relog intents means that all intents currently have a 1:1 relationship with their done intent, and hence we can apply this cancellation to all existing intent/intent done implementations. For delayed attributes with a 16-way 64kB xattr create workload, whiteouts reduce the amount of journalled metadata from ~2.5GB/s down to ~600MB/s and improve the creation rate from 9000/s to 14000/s. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
302 lines
9.7 KiB
C
302 lines
9.7 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
|
|
* All Rights Reserved.
|
|
*/
|
|
#ifndef __XFS_TRANS_H__
|
|
#define __XFS_TRANS_H__
|
|
|
|
/* kernel only transaction subsystem defines */
|
|
|
|
/* Forward declarations of structure tags. */
struct xlog;
struct xfs_mount;
struct xfs_inode;
struct xfs_buf;
struct xfs_buftarg;
struct xfs_btree_cur;
struct xfs_dquot_acct;
struct xfs_trans;
struct xfs_trans_res;
struct xfs_item_ops;
struct xfs_log_iovec;

/* Log item tags, grouped with their matching "done" tags. */
struct xfs_efi_log_item;
struct xfs_efd_log_item;
struct xfs_rui_log_item;
struct xfs_rud_log_item;
struct xfs_cui_log_item;
struct xfs_cud_log_item;
struct xfs_bui_log_item;
struct xfs_bud_log_item;
|
|
|
|
struct xfs_log_item {
|
|
struct list_head li_ail; /* AIL pointers */
|
|
struct list_head li_trans; /* transaction list */
|
|
xfs_lsn_t li_lsn; /* last on-disk lsn */
|
|
struct xlog *li_log;
|
|
struct xfs_ail *li_ailp; /* ptr to AIL */
|
|
uint li_type; /* item type */
|
|
unsigned long li_flags; /* misc flags */
|
|
struct xfs_buf *li_buf; /* real buffer pointer */
|
|
struct list_head li_bio_list; /* buffer item list */
|
|
const struct xfs_item_ops *li_ops; /* function list */
|
|
|
|
/* delayed logging */
|
|
struct list_head li_cil; /* CIL pointers */
|
|
struct xfs_log_vec *li_lv; /* active log vector */
|
|
struct xfs_log_vec *li_lv_shadow; /* standby vector */
|
|
xfs_csn_t li_seq; /* CIL commit seq */
|
|
};
|
|
|
|
/*
 * Bit numbers within li_flags.
 *
 * Updates to li_flags can race with one another, so the flags are manipulated
 * with the atomic set_bit/test_bit/clear_bit interfaces rather than by
 * serialising every update on the AIL lock.
 */
#define	XFS_LI_IN_AIL		0
#define	XFS_LI_ABORTED		1
#define	XFS_LI_FAILED		2
#define	XFS_LI_DIRTY		3
#define	XFS_LI_WHITEOUT		4

/* Mask/name pairs, one per li_flags bit defined above. */
#define XFS_LI_FLAGS \
	{ (1u << XFS_LI_IN_AIL),	"IN_AIL" }, \
	{ (1u << XFS_LI_ABORTED),	"ABORTED" }, \
	{ (1u << XFS_LI_FAILED),	"FAILED" }, \
	{ (1u << XFS_LI_DIRTY),		"DIRTY" }, \
	{ (1u << XFS_LI_WHITEOUT),	"WHITEOUT" }
|
|
|
|
struct xfs_item_ops {
|
|
unsigned flags;
|
|
void (*iop_size)(struct xfs_log_item *, int *, int *);
|
|
void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
|
|
void (*iop_pin)(struct xfs_log_item *);
|
|
void (*iop_unpin)(struct xfs_log_item *, int remove);
|
|
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
|
|
void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
|
|
void (*iop_release)(struct xfs_log_item *);
|
|
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
|
|
int (*iop_recover)(struct xfs_log_item *lip,
|
|
struct list_head *capture_list);
|
|
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
|
|
struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
|
|
struct xfs_trans *tp);
|
|
struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
|
|
};
|
|
|
|
/*
 * Log item ops flags.
 *
 * XFS_ITEM_RELEASE_WHEN_COMMITTED: release the log item when the journal
 * commits instead of inserting it into the AIL for writeback tracking and/or
 * log tail pinning.
 *
 * XFS_ITEM_INTENT and XFS_ITEM_INTENT_DONE are the bits tested by
 * xlog_item_is_intent() and xlog_item_is_intent_done() respectively.
 */
#define XFS_ITEM_RELEASE_WHEN_COMMITTED	(1 << 0)
#define XFS_ITEM_INTENT			(1 << 1)
#define XFS_ITEM_INTENT_DONE		(1 << 2)
|
|
|
|
static inline bool
|
|
xlog_item_is_intent(struct xfs_log_item *lip)
|
|
{
|
|
return lip->li_ops->flags & XFS_ITEM_INTENT;
|
|
}
|
|
|
|
static inline bool
|
|
xlog_item_is_intent_done(struct xfs_log_item *lip)
|
|
{
|
|
return lip->li_ops->flags & XFS_ITEM_INTENT_DONE;
|
|
}
|
|
|
|
/* Initialise a log item with its mount, item type and operations vector. */
void	xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
		int type, const struct xfs_item_ops *ops);
|
|
|
|
/* Values an iop_push() routine may return. */
#define	XFS_ITEM_SUCCESS	0
#define	XFS_ITEM_PINNED		1
#define	XFS_ITEM_LOCKED		2
#define	XFS_ITEM_FLUSHING	3
|
|
|
|
/*
|
|
* This is the structure maintained for every active transaction.
|
|
*/
|
|
typedef struct xfs_trans {
|
|
unsigned int t_magic; /* magic number */
|
|
unsigned int t_log_res; /* amt of log space resvd */
|
|
unsigned int t_log_count; /* count for perm log res */
|
|
unsigned int t_blk_res; /* # of blocks resvd */
|
|
unsigned int t_blk_res_used; /* # of resvd blocks used */
|
|
unsigned int t_rtx_res; /* # of rt extents resvd */
|
|
unsigned int t_rtx_res_used; /* # of resvd rt extents used */
|
|
unsigned int t_flags; /* misc flags */
|
|
xfs_fsblock_t t_firstblock; /* first block allocated */
|
|
struct xlog_ticket *t_ticket; /* log mgr ticket */
|
|
struct xfs_mount *t_mountp; /* ptr to fs mount struct */
|
|
struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
|
|
int64_t t_icount_delta; /* superblock icount change */
|
|
int64_t t_ifree_delta; /* superblock ifree change */
|
|
int64_t t_fdblocks_delta; /* superblock fdblocks chg */
|
|
int64_t t_res_fdblocks_delta; /* on-disk only chg */
|
|
int64_t t_frextents_delta;/* superblock freextents chg*/
|
|
int64_t t_res_frextents_delta; /* on-disk only chg */
|
|
int64_t t_dblocks_delta;/* superblock dblocks change */
|
|
int64_t t_agcount_delta;/* superblock agcount change */
|
|
int64_t t_imaxpct_delta;/* superblock imaxpct change */
|
|
int64_t t_rextsize_delta;/* superblock rextsize chg */
|
|
int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
|
|
int64_t t_rblocks_delta;/* superblock rblocks change */
|
|
int64_t t_rextents_delta;/* superblocks rextents chg */
|
|
int64_t t_rextslog_delta;/* superblocks rextslog chg */
|
|
struct list_head t_items; /* log item descriptors */
|
|
struct list_head t_busy; /* list of busy extents */
|
|
struct list_head t_dfops; /* deferred operations */
|
|
unsigned long t_pflags; /* saved process flags state */
|
|
} xfs_trans_t;
|
|
|
|
/*
 * Exported transaction interfaces that are implemented as macros.
 *
 * xfs_trans_set_sync() ORs XFS_TRANS_SYNC into the transaction's t_flags.
 */
#define	xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
|
|
|
|
/*
|
|
* XFS transaction mechanism exported interfaces.
|
|
*/
|
|
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
|
|
uint blocks, uint rtextents, uint flags,
|
|
struct xfs_trans **tpp);
|
|
int xfs_trans_alloc_empty(struct xfs_mount *mp,
|
|
struct xfs_trans **tpp);
|
|
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
|
|
|
|
int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target,
|
|
struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags,
|
|
struct xfs_buf **bpp);
|
|
|
|
static inline int
|
|
xfs_trans_get_buf(
|
|
struct xfs_trans *tp,
|
|
struct xfs_buftarg *target,
|
|
xfs_daddr_t blkno,
|
|
int numblks,
|
|
xfs_buf_flags_t flags,
|
|
struct xfs_buf **bpp)
|
|
{
|
|
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
|
|
return xfs_trans_get_buf_map(tp, target, &map, 1, flags, bpp);
|
|
}
|
|
|
|
int xfs_trans_read_buf_map(struct xfs_mount *mp,
|
|
struct xfs_trans *tp,
|
|
struct xfs_buftarg *target,
|
|
struct xfs_buf_map *map, int nmaps,
|
|
xfs_buf_flags_t flags,
|
|
struct xfs_buf **bpp,
|
|
const struct xfs_buf_ops *ops);
|
|
|
|
static inline int
|
|
xfs_trans_read_buf(
|
|
struct xfs_mount *mp,
|
|
struct xfs_trans *tp,
|
|
struct xfs_buftarg *target,
|
|
xfs_daddr_t blkno,
|
|
int numblks,
|
|
xfs_buf_flags_t flags,
|
|
struct xfs_buf **bpp,
|
|
const struct xfs_buf_ops *ops)
|
|
{
|
|
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
|
|
return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
|
|
flags, bpp, ops);
|
|
}
|
|
|
|
struct xfs_buf *xfs_trans_getsb(struct xfs_trans *);
|
|
|
|
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
|
|
bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
|
|
void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
|
|
void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
|
|
void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
|
|
void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
|
|
uint);
|
|
void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
|
|
bool xfs_trans_buf_is_dirty(struct xfs_buf *bp);
|
|
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
|
|
|
|
int xfs_trans_commit(struct xfs_trans *);
|
|
int xfs_trans_roll(struct xfs_trans **);
|
|
int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
|
|
void xfs_trans_cancel(xfs_trans_t *);
|
|
int xfs_trans_ail_init(struct xfs_mount *);
|
|
void xfs_trans_ail_destroy(struct xfs_mount *);
|
|
|
|
void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
|
|
enum xfs_blft);
|
|
void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
|
|
struct xfs_buf *src_bp);
|
|
|
|
extern struct kmem_cache *xfs_trans_cache;
|
|
|
|
static inline struct xfs_log_item *
|
|
xfs_trans_item_relog(
|
|
struct xfs_log_item *lip,
|
|
struct xfs_trans *tp)
|
|
{
|
|
return lip->li_ops->iop_relog(lip, tp);
|
|
}
|
|
|
|
struct xfs_dquot;

int	xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
		unsigned int dblocks, unsigned int rblocks, bool force,
		struct xfs_trans **tpp);
int	xfs_trans_alloc_icreate(struct xfs_mount *mp,
		struct xfs_trans_res *resv, struct xfs_dquot *udqp,
		struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
		unsigned int dblocks, struct xfs_trans **tpp);
int	xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
		struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
		struct xfs_trans **tpp);
int	xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
		struct xfs_inode *ip, unsigned int *dblocks,
		struct xfs_trans **tpp, int *nospace_error);
|
|
|
|
static inline void
|
|
xfs_trans_set_context(
|
|
struct xfs_trans *tp)
|
|
{
|
|
ASSERT(current->journal_info == NULL);
|
|
tp->t_pflags = memalloc_nofs_save();
|
|
current->journal_info = tp;
|
|
}
|
|
|
|
static inline void
|
|
xfs_trans_clear_context(
|
|
struct xfs_trans *tp)
|
|
{
|
|
if (current->journal_info == tp) {
|
|
memalloc_nofs_restore(tp->t_pflags);
|
|
current->journal_info = NULL;
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
xfs_trans_switch_context(
|
|
struct xfs_trans *old_tp,
|
|
struct xfs_trans *new_tp)
|
|
{
|
|
ASSERT(current->journal_info == old_tp);
|
|
new_tp->t_pflags = old_tp->t_pflags;
|
|
old_tp->t_pflags = 0;
|
|
current->journal_info = new_tp;
|
|
}
|
|
|
|
#endif /* __XFS_TRANS_H__ */
|