diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index fdedf1ea944b..3e1630c70d8a 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -311,10 +311,13 @@ Description: Do background GC aggressively when set. Set to 0 by default. GC approach and turns SSR mode on. gc urgent low(2): lowers the bar of checking I/O idling in order to process outstanding discard commands and GC a - little bit aggressively. uses cost benefit GC approach. + little bit aggressively. always uses cost benefit GC approach, + and will override age-threshold GC approach if ATGC is enabled + at the same time. gc urgent mid(3): does GC forcibly in a period of given gc_urgent_sleep_time and executes a mid level of I/O idling check. - uses cost benefit GC approach. + always uses cost benefit GC approach, and will override + age-threshold GC approach if ATGC is enabled at the same time. What: /sys/fs/f2fs//gc_urgent_sleep_time Date: August 2017 @@ -819,3 +822,9 @@ Description: It controls the valid block ratio threshold not to trigger excessiv for zoned deivces. The initial value of it is 95(%). F2FS will stop the background GC thread from intiating GC for sections having valid blocks exceeding the ratio. + +What: /sys/fs/f2fs//max_read_extent_count +Date: November 2024 +Contact: "Chao Yu" +Description: It controls the maximum read extent count per inode; the threshold + is 10240 by default. diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 68a0885fb5e6..fb7d2ee022bc 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -943,3 +943,47 @@ NVMe Zoned Namespace devices can start before the zone-capacity and span across zone-capacity boundary. Such spanning segments are also considered as usable segments. All blocks past the zone-capacity are considered unusable in these segments. 
+ +Device aliasing feature +----------------------- + +f2fs can utilize a special file called a "device aliasing file." This file allows +the entire storage device to be mapped with a single, large extent, without using +the usual f2fs node structures. This mapped area is pinned and primarily intended +for holding the space. + +Essentially, this mechanism allows a portion of the f2fs area to be temporarily +reserved and used by another filesystem or for different purposes. Once that +external usage is complete, the device aliasing file can be deleted, releasing +the reserved space back to F2FS for its own use. + + + +# ls /dev/vd* +/dev/vdb (32GB) /dev/vdc (32GB) +# mkfs.ext4 /dev/vdc +# mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb +# mount /dev/vdb /mnt/f2fs +# ls -l /mnt/f2fs +vdc.file +# df -h +/dev/vdb 64G 33G 32G 52% /mnt/f2fs + +# mount -o loop /dev/vdc /mnt/ext4 +# df -h +/dev/vdb 64G 33G 32G 52% /mnt/f2fs +/dev/loop7 32G 24K 30G 1% /mnt/ext4 +# umount /mnt/ext4 + +# f2fs_io getflags /mnt/f2fs/vdc.file +get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable +# f2fs_io setflags noimmutable /mnt/f2fs/vdc.file +get a flag on noimmutable ret=0, flags=800010 +set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable +# rm /mnt/f2fs/vdc.file +# df -h +/dev/vdb 64G 753M 64G 2% /mnt/f2fs + +So, the key idea is that the user can perform any file operations on /dev/vdc and +reclaim the space after use, while the space is counted as /data. +This does not require modifying the partition size or the filesystem format. 
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 8bffdeccdbc3..1fbc0607363b 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -296,9 +296,8 @@ static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7f76460b721f..efda9a022981 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -32,7 +32,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, f2fs_build_fault_attr(sbi, 0, 0); if (!end_io) f2fs_flush_merged_writes(sbi); - f2fs_handle_critical_error(sbi, reason, end_io); + f2fs_handle_critical_error(sbi, reason); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e3ce763cce18..a2478c2afb3a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1679,7 +1679,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) /* reserved delalloc block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) map->m_flags |= F2FS_MAP_DELALLOC; - if (flag != F2FS_GET_BLOCK_DIO || !is_hole) + /* DIO READ and hole case, should not map the blocks. 
*/ + if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create)) map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; @@ -1821,16 +1822,6 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) return true; } -static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) -{ - return (bytes >> inode->i_blkbits); -} - -static inline u64 blks_to_bytes(struct inode *inode, u64 blks) -{ - return (blks << inode->i_blkbits); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -1856,7 +1847,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = blks_to_bytes(inode, ni.blk_addr); + phys = F2FS_BLK_TO_BYTES(ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); @@ -1888,7 +1879,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = blks_to_bytes(inode, ni.blk_addr); + phys = F2FS_BLK_TO_BYTES(ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); @@ -1904,30 +1895,11 @@ static int f2fs_xattr_fiemap(struct inode *inode, return (err < 0 ? 
err : 0); } -static loff_t max_inode_blocks(struct inode *inode) -{ - loff_t result = ADDRS_PER_INODE(inode); - loff_t leaf_count = ADDRS_PER_BLOCK(inode); - - /* two direct node blocks */ - result += (leaf_count * 2); - - /* two indirect node blocks */ - leaf_count *= NIDS_PER_BLOCK; - result += (leaf_count * 2); - - /* one double indirect node block */ - leaf_count *= NIDS_PER_BLOCK; - result += leaf_count; - - return result; -} - int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct f2fs_map_blocks map; - sector_t start_blk, last_blk; + sector_t start_blk, last_blk, blk_len, max_len; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; @@ -1969,16 +1941,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - if (bytes_to_blks(inode, len) == 0) - len = blks_to_bytes(inode, 1); - - start_blk = bytes_to_blks(inode, start); - last_blk = bytes_to_blks(inode, start + len - 1); + start_blk = F2FS_BYTES_TO_BLK(start); + last_blk = F2FS_BYTES_TO_BLK(start + len - 1); + blk_len = last_blk - start_blk + 1; + max_len = F2FS_BYTES_TO_BLK(maxbytes) - start_blk; next: memset(&map, 0, sizeof(map)); map.m_lblk = start_blk; - map.m_len = bytes_to_blks(inode, len); + map.m_len = blk_len; map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; @@ -1995,13 +1966,23 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; - if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, - max_inode_blocks(inode))) + if (F2FS_BLK_TO_BYTES(start_blk) < maxbytes) goto prep_next; flags |= FIEMAP_EXTENT_LAST; } + /* + * current extent may cross boundary of inquiry, increase len to + * requery. 
+ */ + if (!compr_cluster && (map.m_flags & F2FS_MAP_MAPPED) && + map.m_lblk + map.m_len - 1 == last_blk && + blk_len != max_len) { + blk_len = max_len; + goto next; + } + compr_appended = false; /* In a case of compressed cluster, append this to the last extent */ if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || @@ -2033,14 +2014,14 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } else if (compr_appended) { unsigned int appended_blks = cluster_size - count_in_cluster + 1; - size += blks_to_bytes(inode, appended_blks); + size += F2FS_BLK_TO_BYTES(appended_blks); start_blk += appended_blks; compr_cluster = false; } else { - logical = blks_to_bytes(inode, start_blk); + logical = F2FS_BLK_TO_BYTES(start_blk); phys = __is_valid_data_blkaddr(map.m_pblk) ? - blks_to_bytes(inode, map.m_pblk) : 0; - size = blks_to_bytes(inode, map.m_len); + F2FS_BLK_TO_BYTES(map.m_pblk) : 0; + size = F2FS_BLK_TO_BYTES(map.m_len); flags = 0; if (compr_cluster) { @@ -2048,13 +2029,13 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, count_in_cluster += map.m_len; if (count_in_cluster == cluster_size) { compr_cluster = false; - size += blks_to_bytes(inode, 1); + size += F2FS_BLKSIZE; } } else if (map.m_flags & F2FS_MAP_DELALLOC) { flags = FIEMAP_EXTENT_UNWRITTEN; } - start_blk += bytes_to_blks(inode, size); + start_blk += F2FS_BYTES_TO_BLK(size); } prep_next: @@ -2092,7 +2073,7 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio, struct readahead_control *rac) { struct bio *bio = *bio_ret; - const unsigned blocksize = blks_to_bytes(inode, 1); + const unsigned int blocksize = F2FS_BLKSIZE; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -2102,8 +2083,8 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio, block_in_file = (sector_t)index; last_block = block_in_file + nr_pages; - last_block_in_file = bytes_to_blks(inode, - f2fs_readpage_limit(inode) + 
blocksize - 1); + last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -2203,7 +2184,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; - const unsigned blocksize = blks_to_bytes(inode, 1); + const unsigned int blocksize = F2FS_BLKSIZE; struct decompress_io_ctx *dic = NULL; struct extent_info ei = {}; bool from_dnode = true; @@ -2212,8 +2193,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); - last_block_in_file = bytes_to_blks(inode, - f2fs_readpage_limit(inode) + blocksize - 1); + last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { @@ -2388,10 +2369,10 @@ static int f2fs_mpage_readpages(struct inode *inode, .nr_cpages = 0, }; pgoff_t nc_cluster_idx = NULL_CLUSTER; + pgoff_t index; #endif unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; - pgoff_t index; int ret = 0; map.m_pblk = 0; @@ -2409,9 +2390,9 @@ static int f2fs_mpage_readpages(struct inode *inode, prefetchw(&folio->flags); } +#ifdef CONFIG_F2FS_FS_COMPRESSION index = folio_index(folio); -#ifdef CONFIG_F2FS_FS_COMPRESSION if (!f2fs_compressed_file(inode)) goto read_single_page; @@ -3444,6 +3425,11 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { + if (IS_DEVICE_ALIASING(inode)) { + err = -ENODATA; + goto out; + } + if (locked) { err = f2fs_reserve_block(&dn, index); goto out; @@ -3974,7 +3960,7 @@ static int check_swap_activate(struct swap_info_struct *sis, * to be very smart. 
*/ cur_lblock = 0; - last_lblock = bytes_to_blks(inode, i_size_read(inode)); + last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode)); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; @@ -4217,8 +4203,8 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, pgoff_t next_pgofs = 0; int err; - map.m_lblk = bytes_to_blks(inode, offset); - map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_lblk = F2FS_BYTES_TO_BLK(offset); + map.m_len = F2FS_BYTES_TO_BLK(offset + length - 1) - map.m_lblk + 1; map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); @@ -4229,7 +4215,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (err) return err; - iomap->offset = blks_to_bytes(inode, map.m_lblk); + iomap->offset = F2FS_BLK_TO_BYTES(map.m_lblk); /* * When inline encryption is enabled, sometimes I/O to an encrypted file @@ -4249,21 +4235,21 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) return -EINVAL; - iomap->length = blks_to_bytes(inode, map.m_len); + iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_MAPPED; iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; - iomap->addr = blks_to_bytes(inode, map.m_pblk); + iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); } else { if (flags & IOMAP_WRITE) return -ENOTBLK; if (map.m_pblk == NULL_ADDR) { - iomap->length = blks_to_bytes(inode, next_pgofs) - - iomap->offset; + iomap->length = F2FS_BLK_TO_BYTES(next_pgofs) - + iomap->offset; iomap->type = IOMAP_HOLE; } else if (map.m_pblk == NEW_ADDR) { - iomap->length = blks_to_bytes(inode, map.m_len); + iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_UNWRITTEN; } else { f2fs_bug_on(F2FS_I_SB(inode), 1); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 546b8ba91261..468828288a4a 100644 --- 
a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -60,6 +60,70 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_DEBUG_FS +static void update_multidevice_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + struct f2fs_dev_stats *dev_stats = si->dev_stats; + int i, j; + + if (!f2fs_is_multi_device(sbi)) + return; + + memset(dev_stats, 0, sizeof(struct f2fs_dev_stats) * sbi->s_ndevs); + for (i = 0; i < sbi->s_ndevs; i++) { + unsigned int start_segno, end_segno; + block_t start_blk, end_blk; + + if (i == 0) { + start_blk = MAIN_BLKADDR(sbi); + end_blk = FDEV(i).end_blk + 1 - SEG0_BLKADDR(sbi); + } else { + start_blk = FDEV(i).start_blk; + end_blk = FDEV(i).end_blk + 1; + } + + start_segno = GET_SEGNO(sbi, start_blk); + end_segno = GET_SEGNO(sbi, end_blk); + + for (j = start_segno; j < end_segno; j++) { + unsigned int seg_blks, sec_blks; + + seg_blks = get_seg_entry(sbi, j)->valid_blocks; + + /* update segment stats */ + if (IS_CURSEG(sbi, j)) + dev_stats[i].devstats[0][DEVSTAT_INUSE]++; + else if (seg_blks == BLKS_PER_SEG(sbi)) + dev_stats[i].devstats[0][DEVSTAT_FULL]++; + else if (seg_blks != 0) + dev_stats[i].devstats[0][DEVSTAT_DIRTY]++; + else if (!test_bit(j, FREE_I(sbi)->free_segmap)) + dev_stats[i].devstats[0][DEVSTAT_FREE]++; + else + dev_stats[i].devstats[0][DEVSTAT_PREFREE]++; + + if (!__is_large_section(sbi) || + (j % SEGS_PER_SEC(sbi)) != 0) + continue; + + sec_blks = get_sec_entry(sbi, j)->valid_blocks; + + /* update section stats */ + if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j))) + dev_stats[i].devstats[1][DEVSTAT_INUSE]++; + else if (sec_blks == BLKS_PER_SEC(sbi)) + dev_stats[i].devstats[1][DEVSTAT_FULL]++; + else if (sec_blks != 0) + dev_stats[i].devstats[1][DEVSTAT_DIRTY]++; + else if (!test_bit(GET_SEC_FROM_SEG(sbi, j), + FREE_I(sbi)->free_secmap)) + dev_stats[i].devstats[1][DEVSTAT_FREE]++; + else + dev_stats[i].devstats[1][DEVSTAT_PREFREE]++; + } + } +} + static void update_general_status(struct 
f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); @@ -214,6 +278,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_blks[type] += blks; } + update_multidevice_stats(sbi); + for (i = 0; i < MAX_CALL_TYPE; i++) si->cp_call_count[i] = atomic_read(&sbi->cp_call_count[i]); @@ -498,6 +564,36 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + if (f2fs_is_multi_device(sbi)) { + seq_puts(s, "Multidevice stats:\n"); + seq_printf(s, " [seg: %8s %8s %8s %8s %8s]", + "inuse", "dirty", "full", "free", "prefree"); + if (__is_large_section(sbi)) + seq_printf(s, " [sec: %8s %8s %8s %8s %8s]\n", + "inuse", "dirty", "full", "free", "prefree"); + else + seq_puts(s, "\n"); + + for (i = 0; i < sbi->s_ndevs; i++) { + seq_printf(s, " #%-2d %8u %8u %8u %8u %8u", i, + si->dev_stats[i].devstats[0][DEVSTAT_INUSE], + si->dev_stats[i].devstats[0][DEVSTAT_DIRTY], + si->dev_stats[i].devstats[0][DEVSTAT_FULL], + si->dev_stats[i].devstats[0][DEVSTAT_FREE], + si->dev_stats[i].devstats[0][DEVSTAT_PREFREE]); + if (!__is_large_section(sbi)) { + seq_puts(s, "\n"); + continue; + } + seq_printf(s, " %8u %8u %8u %8u %8u\n", + si->dev_stats[i].devstats[1][DEVSTAT_INUSE], + si->dev_stats[i].devstats[1][DEVSTAT_DIRTY], + si->dev_stats[i].devstats[1][DEVSTAT_FULL], + si->dev_stats[i].devstats[1][DEVSTAT_FREE], + si->dev_stats[i].devstats[1][DEVSTAT_PREFREE]); + } + seq_puts(s, "\n"); + } seq_printf(s, "CP calls: %d (BG: %d)\n", si->cp_call_count[TOTAL_CALL], si->cp_call_count[BACKGROUND]); @@ -598,9 +694,9 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4d in files:%4d\n", + seq_printf(s, " - data: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - quota 
datas: %4d in quota files:%4d\n", + seq_printf(s, " - quota data: %4d in quota files:%4d\n", si->ndirty_qdata, si->nquota_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); @@ -665,6 +761,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + struct f2fs_dev_stats *dev_stats; unsigned long flags; int i; @@ -672,6 +769,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) if (!si) return -ENOMEM; + dev_stats = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_stats) * + sbi->s_ndevs, GFP_KERNEL); + if (!dev_stats) { + kfree(si); + return -ENOMEM; + } + + si->dev_stats = dev_stats; + si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -724,6 +830,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) list_del(&si->stat_list); raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + kfree(si->dev_stats); kfree(si); } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 62ac440d9416..347b3b647834 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -24,6 +24,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_info ei; + int devi; get_read_extent_info(&ei, i_ext); @@ -38,7 +39,36 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) ei.blk, ei.fofs, ei.len); return false; } - return true; + + if (!IS_DEVICE_ALIASING(inode)) + return true; + + for (devi = 0; devi < sbi->s_ndevs; devi++) { + if (FDEV(devi).start_blk != ei.blk || + FDEV(devi).end_blk != ei.blk + ei.len - 1) + continue; + + if (devi == 0) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) is an alias of meta device", + __func__, inode->i_ino); + return false; + } + + if 
(bdev_is_zoned(FDEV(devi).bdev)) { + f2fs_warn(sbi, + "%s: device alias inode (ino=%lx)'s extent info " + "[%u, %u, %u] maps to zoned block device", + __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); + return false; + } + return true; + } + + f2fs_warn(sbi, "%s: device alias inode (ino=%lx)'s extent info " + "[%u, %u, %u] is inconsistent w/ any devices", + __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); + return false; } static void __set_extent_info(struct extent_info *ei, @@ -76,6 +106,9 @@ static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) static bool __may_extent_tree(struct inode *inode, enum extent_type type) { + if (IS_DEVICE_ALIASING(inode) && type == EX_READ) + return true; + /* * for recovered files during mount do not create extents * if shrinker is not registered. @@ -346,21 +379,22 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode, } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et) + struct extent_tree *et, unsigned int nr_shrink) { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = atomic_read(&et->node_cnt); + unsigned int count; node = rb_first_cached(&et->root); - while (node) { + + for (count = 0; node && count < nr_shrink; count++) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); __release_extent_node(sbi, et, en); node = next; } - return count - atomic_read(&et->node_cnt); + return count; } static void __drop_largest_extent(struct extent_tree *et, @@ -401,6 +435,11 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) if (atomic_read(&et->node_cnt) || !ei.len) goto skip; + if (IS_DEVICE_ALIASING(inode)) { + et->largest = ei; + goto skip; + } + en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); if (en) { @@ -463,6 +502,11 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } + if (IS_DEVICE_ALIASING(inode)) { + ret = false; + 
goto out; + } + en = __lookup_extent_node(&et->root, et->cached_en, pgofs); if (!en) goto out; @@ -579,6 +623,30 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, return en; } +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + unsigned int nr_shrink = type == EX_READ ? + READ_EXTENT_CACHE_SHRINK_NUMBER : + AGE_EXTENT_CACHE_SHRINK_NUMBER; + unsigned int node_cnt = 0; + + if (!et || !atomic_read(&et->node_cnt)) + return 0; + + while (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, nr_shrink); + write_unlock(&et->lock); + } + + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + + return node_cnt; +} + static void __update_extent_tree_range(struct inode *inode, struct extent_info *tei, enum extent_type type) { @@ -649,7 +717,9 @@ static void __update_extent_tree_range(struct inode *inode, } if (end < org_end && (type != EX_READ || - org_end - end >= F2FS_MIN_EXTENT_LEN)) { + (org_end - end >= F2FS_MIN_EXTENT_LEN && + atomic_read(&et->node_cnt) < + sbi->max_read_extent_count))) { if (parts) { __set_extent_info(&ei, end, org_end - end, @@ -717,9 +787,6 @@ static void __update_extent_tree_range(struct inode *inode, } } - if (is_inode_flag_set(inode, FI_NO_EXTENT)) - __free_extent_tree(sbi, et); - if (et->largest_updated) { et->largest_updated = false; updated = true; @@ -737,6 +804,9 @@ static void __update_extent_tree_range(struct inode *inode, out_read_extent_cache: write_unlock(&et->lock); + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __destroy_extent_node(inode, EX_READ); + if (updated) f2fs_mark_inode_dirty_sync(inode, true); } @@ -899,10 +969,14 @@ static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); 
- node_cnt += __free_extent_tree(sbi, et); + node_cnt += __free_extent_tree(sbi, et, + nr_shrink - node_cnt - tree_cnt); write_unlock(&et->lock); } - f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + + if (atomic_read(&et->node_cnt)) + goto unlock_out; + list_del_init(&et->list); radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); @@ -1041,23 +1115,6 @@ unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); } -static unsigned int __destroy_extent_node(struct inode *inode, - enum extent_type type) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; - unsigned int node_cnt = 0; - - if (!et || !atomic_read(&et->node_cnt)) - return 0; - - write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et); - write_unlock(&et->lock); - - return node_cnt; -} - void f2fs_destroy_extent_node(struct inode *inode) { __destroy_extent_node(inode, EX_READ); @@ -1066,7 +1123,6 @@ void f2fs_destroy_extent_node(struct inode *inode) static void __drop_extent_tree(struct inode *inode, enum extent_type type) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; @@ -1074,7 +1130,6 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) return; write_lock(&et->lock); - __free_extent_tree(sbi, et); if (type == EX_READ) { set_inode_flag(inode, FI_NO_EXTENT); if (et->largest.len) { @@ -1083,6 +1138,9 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) } } write_unlock(&et->lock); + + __destroy_extent_node(inode, type); + if (updated) f2fs_mark_inode_dirty_sync(inode, true); } @@ -1156,6 +1214,7 @@ void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; sbi->last_age_weight = 
LAST_AGE_WEIGHT; + sbi->max_read_extent_count = DEF_MAX_READ_EXTENT_COUNT; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 33f5449dc22d..6f2cbf4c5740 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -213,6 +213,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_CASEFOLD 0x00001000 #define F2FS_FEATURE_COMPRESSION 0x00002000 #define F2FS_FEATURE_RO 0x00004000 +#define F2FS_FEATURE_DEVICE_ALIAS 0x00008000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -634,6 +635,9 @@ enum { #define DEF_HOT_DATA_AGE_THRESHOLD 262144 #define DEF_WARM_DATA_AGE_THRESHOLD 2621440 +/* default max read extent count per inode */ +#define DEF_MAX_READ_EXTENT_COUNT 10240 + /* extent cache type */ enum extent_type { EX_READ, @@ -1018,7 +1022,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) -enum { +enum log_type { CURSEG_HOT_DATA = 0, /* directory entry blocks */ CURSEG_WARM_DATA, /* data blocks */ CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ @@ -1063,7 +1067,6 @@ struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ - unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ @@ -1619,6 +1622,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; atomic64_t allocated_data_blocks; /* for block age extent_cache */ + unsigned int max_read_extent_count; /* max read extent count per inode */ /* The threshold used for hot and warm data seperation*/ unsigned int hot_data_age_threshold; @@ 
-1758,6 +1762,7 @@ struct f2fs_sb_info { unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ + unsigned int first_zoned_segno; /* first zoned segno */ /* For write statistics */ u64 sectors_written_start; @@ -3046,6 +3051,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define F2FS_DEVICE_ALIAS_FL 0x80000000 /* File for aliasing a device */ #define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL) @@ -3061,6 +3067,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that are appropriate for non-directories/regular files. */ #define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) +#define IS_DEVICE_ALIASING(inode) (F2FS_I(inode)->i_flags & F2FS_DEVICE_ALIAS_FL) + static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) @@ -3632,8 +3640,7 @@ int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); -void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, - bool irq_context); +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); @@ -3754,7 +3761,8 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); -int 
f2fs_get_segment_temp(int seg_type); +enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, + enum log_type seg_type); int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, @@ -3771,8 +3779,7 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi); -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi); int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); @@ -3783,6 +3790,8 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); +unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, + unsigned int segno); #define DEF_FRAGMENT_SIZE 4 #define MIN_FRAGMENT_SIZE 1 @@ -3935,6 +3944,19 @@ void f2fs_destroy_recovery_cache(void); * debug.c */ #ifdef CONFIG_F2FS_STAT_FS +enum { + DEVSTAT_INUSE, + DEVSTAT_DIRTY, + DEVSTAT_FULL, + DEVSTAT_FREE, + DEVSTAT_PREFREE, + DEVSTAT_MAX, +}; + +struct f2fs_dev_stats { + unsigned int devstats[2][DEVSTAT_MAX]; /* 0: segs, 1: secs */ +}; + struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; @@ -3998,6 +4020,7 @@ struct f2fs_stat_info { unsigned int block_count[2]; unsigned int inplace_count; unsigned long long base_mem, cache_mem, page_mem; + struct f2fs_dev_stats *dev_stats; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) @@ -4510,6 +4533,7 @@ 
F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); +F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84447d5145aa..aa9679b3d8e4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -725,6 +725,11 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) trace_f2fs_truncate_blocks_enter(inode, from); + if (IS_DEVICE_ALIASING(inode) && from) { + err = -EINVAL; + goto out_err; + } + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); if (free_from >= max_file_blocks(inode)) @@ -739,6 +744,21 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto out; } + if (IS_DEVICE_ALIASING(inode)) { + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; + struct extent_info ei = et->largest; + unsigned int i; + + for (i = 0; i < ei.len; i++) + f2fs_invalidate_blocks(sbi, ei.blk + i); + + dec_valid_block_count(sbi, inode, ei.len); + f2fs_update_time(sbi, REQ_TIME); + + f2fs_put_page(ipage, 1); + goto out; + } + if (f2fs_has_inline_data(inode)) { f2fs_truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); @@ -774,7 +794,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); - +out_err: trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -863,7 +883,11 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) return true; if (f2fs_compressed_file(inode)) return true; - if (f2fs_has_inline_data(inode)) + /* + * only force direct read to use buffered IO, for direct write, + * it expects inline data conversion before committing IO. 
+ */ + if (f2fs_has_inline_data(inode) && rw == READ) return true; /* disallow direct IO if any of devices has unaligned blksize */ @@ -992,7 +1016,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return -EPERM; if ((attr->ia_valid & ATTR_SIZE)) { - if (!f2fs_is_compress_backend_ready(inode)) + if (!f2fs_is_compress_backend_ready(inode) || + IS_DEVICE_ALIASING(inode)) return -EOPNOTSUPP; if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) && !IS_ALIGNED(attr->ia_size, @@ -1790,7 +1815,8 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, map.m_len = sec_blks; next_alloc: - if (has_not_enough_free_secs(sbi, 0, + if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? + ZONED_PIN_SEC_REQUIRED_COUNT : GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); @@ -1860,7 +1886,7 @@ static long f2fs_fallocate(struct file *file, int mode, return -EIO; if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) return -ENOSPC; - if (!f2fs_is_compress_backend_ready(inode)) + if (!f2fs_is_compress_backend_ready(inode) || IS_DEVICE_ALIASING(inode)) return -EOPNOTSUPP; /* f2fs only support ->fallocate for regular file */ @@ -2343,9 +2369,12 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, if (readonly) goto out; - /* grab sb->s_umount to avoid racing w/ remount() */ + /* + * grab sb->s_umount to avoid racing w/ remount() and other shutdown + * paths. 
+ */ if (need_lock) - down_read(&sbi->sb->s_umount); + down_write(&sbi->sb->s_umount); f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); @@ -2354,7 +2383,7 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, clear_opt(sbi, DISCARD); if (need_lock) - up_read(&sbi->sb->s_umount); + up_write(&sbi->sb->s_umount); f2fs_update_time(sbi, REQ_TIME); out: @@ -2861,7 +2890,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + if (!S_ISREG(inode->i_mode)) return -EINVAL; if (f2fs_readonly(sbi->sb)) @@ -3291,6 +3320,9 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + if (IS_DEVICE_ALIASING(inode)) + return -EINVAL; + if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) { f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials", __func__, inode->i_ino, fi->i_gc_failures); @@ -3321,6 +3353,9 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + if (!pin && IS_DEVICE_ALIASING(inode)) + return -EOPNOTSUPP; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -3386,6 +3421,12 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) return put_user(pin, (u32 __user *)arg); } +static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg) +{ + return put_user(IS_DEVICE_ALIASING(file_inode(filp)) ? 
1 : 0, + (u32 __user *)arg); +} + int f2fs_precache_extents(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -3787,7 +3828,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, to_reserved = cluster_size - compr_blocks - reserved; /* for the case all blocks in cluster were reserved */ - if (to_reserved == 1) { + if (reserved && to_reserved == 1) { dn->ofs_in_node += cluster_size; goto next; } @@ -4485,6 +4526,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_decompress_file(filp); case F2FS_IOC_COMPRESS_FILE: return f2fs_ioc_compress_file(filp); + case F2FS_IOC_GET_DEV_ALIAS_FILE: + return f2fs_ioc_get_dev_alias_file(filp, arg); default: return -ENOTTY; } @@ -4760,7 +4803,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, else return 0; - map.m_may_create = true; + if (!IS_DEVICE_ALIASING(inode)) + map.m_may_create = true; if (dio) { map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); @@ -4816,8 +4860,8 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter, { struct inode *inode = iter->inode; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); - enum temp_type temp = f2fs_get_segment_temp(seg_type); + enum log_type type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); + enum temp_type temp = f2fs_get_segment_temp(sbi, type); bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp); submit_bio(bio); @@ -5197,6 +5241,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_COMPRESS_OPTION: case F2FS_IOC_DECOMPRESS_FILE: case F2FS_IOC_COMPRESS_FILE: + case F2FS_IOC_GET_DEV_ALIAS_FILE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9322a7200e31..3e1b6d2ff3a7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -257,6 +257,8 @@ static int select_gc_type(struct 
f2fs_sb_info *sbi, int gc_type) switch (sbi->gc_mode) { case GC_IDLE_CB: + case GC_URGENT_LOW: + case GC_URGENT_MID: gc_mode = GC_CB; break; case GC_IDLE_GREEDY: @@ -361,20 +363,15 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; unsigned char u; - unsigned int i; unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); - for (i = 0; i < usable_segs_per_sec; i++) - mtime += get_seg_entry(sbi, start + i)->mtime; + mtime = f2fs_get_section_mtime(sbi, segno); + f2fs_bug_on(sbi, mtime == INVALID_MTIME); vblocks = get_valid_blocks(sbi, segno, true); - - mtime = div_u64(mtime, usable_segs_per_sec); vblocks = div_u64(vblocks, usable_segs_per_sec); u = BLKS_TO_SEGS(sbi, vblocks * 100); @@ -519,10 +516,7 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, struct victim_sel_policy *p, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; - unsigned int i; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (p->gc_mode == GC_AT && @@ -530,9 +524,8 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, return; } - for (i = 0; i < SEGS_PER_SEC(sbi); i++) - mtime += get_seg_entry(sbi, start + i)->mtime; - mtime = div_u64(mtime, SEGS_PER_SEC(sbi)); + mtime = f2fs_get_section_mtime(sbi, segno); + f2fs_bug_on(sbi, mtime == INVALID_MTIME); /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 2914b678bf8f..5c1eaf55e127 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -35,6 +35,7 @@ #define LIMIT_BOOST_ZONED_GC 25 /* percentage over 
total user space of boosted gc for zoned devices */ #define DEF_MIGRATION_WINDOW_GRANULARITY_ZONED 3 #define BOOST_GC_MULTIPLE 5 +#define ZONED_PIN_SEC_REQUIRED_COUNT 1 #define DEF_GC_FAILED_PINNED_FILES 2048 #define MAX_GC_FAILED_PINNED_FILES USHRT_MAX diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1ed86df343a5..282fd320bdb3 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -372,6 +372,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (IS_DEVICE_ALIASING(inode)) { + if (!f2fs_sb_has_device_alias(sbi)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but the feature is off", + __func__, inode->i_ino); + return false; + } + if (!f2fs_is_pinned_file(inode)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but is not pinned", + __func__, inode->i_ino); + return false; + } + } + return true; } @@ -775,8 +788,10 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; - if (!f2fs_is_checkpoint_ready(sbi)) + if (!f2fs_is_checkpoint_ready(sbi)) { + f2fs_mark_inode_dirty_sync(inode, true); return -ENOSPC; + } /* * We need to balance fs here to prevent from producing dirty node pages @@ -823,7 +838,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); f2fs_remove_dirty_inode(inode); - f2fs_destroy_extent_tree(inode); + if (!IS_DEVICE_ALIASING(inode)) + f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; @@ -879,6 +895,9 @@ void f2fs_evict_inode(struct inode *inode) goto retry; } + if (IS_DEVICE_ALIASING(inode)) + f2fs_destroy_extent_tree(inode); + if (err) { f2fs_update_inode_page(inode); if (dquot_initialize_needed(inode)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 59b13ff243fa..0b900a7a48e5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -905,6 +905,16 @@ static int truncate_node(struct dnode_of_data *dn) if (err) return err; 
+ if (ni.blk_addr != NEW_ADDR && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { + f2fs_err_ratelimited(sbi, + "nat entry is corrupted, run fsck to fix it, ino:%u, " + "nid:%u, blkaddr:%u", ni.ino, ni.nid, ni.blk_addr); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + return -EFSCORRUPTED; + } + /* Deallocate node address */ f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); @@ -1056,7 +1066,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, int i; int idx = depth - 2; - nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + nid[0] = get_nid(dn->inode_page, offset[0], true); if (!nid[0]) return 0; @@ -1167,7 +1177,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = get_nid(page, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1199,13 +1209,10 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) } if (err < 0) goto fail; - if (offset[1] == 0 && - ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { + if (offset[1] == 0 && get_nid(page, offset[0], true)) { lock_page(page); BUG_ON(page->mapping != NODE_MAPPING(sbi)); - f2fs_wait_on_page_writeback(page, NODE, true, true); - ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; - set_page_dirty(page); + set_nid(page, offset[0], 0, true); unlock_page(page); } offset[1] = 0; @@ -1331,7 +1338,12 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) err = -EFSCORRUPTED; dec_valid_node_count(sbi, dn->inode, !ofs); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + f2fs_warn_ratelimited(sbi, + "f2fs_new_node_page: inconsistent nat entry, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + new_ni.ino, new_ni.nid, new_ni.blk_addr, + new_ni.version, new_ni.flag); + 
f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); goto fail; } #endif diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e4d81b8705d1..f35be2c48e3c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -899,13 +899,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) * and the f2fs is not read only, check and fix zoned block devices' * write pointer consistency. */ - if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sbi->sb)) { - int err2 = f2fs_fix_curseg_write_pointer(sbi); - - if (!err2) - err2 = f2fs_check_write_pointer(sbi); - if (err2) - err = err2; + if (!err) { + err = f2fs_check_and_fix_write_pointer(sbi); ret = err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1766254279d2..eade36c5ef13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1290,16 +1290,18 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, wait_list, issued); return 0; } - - /* - * Issue discard for conventional zones only if the device - * supports discard. - */ - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; } #endif + /* + * stop issuing discard for any of below cases: + * 1. device is conventional zone, but it doesn't support discard. + * 2. device is regular device, after snapshot it doesn't support + * discard. 
+ */ + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; + trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; @@ -2711,7 +2713,7 @@ static int get_new_segment(struct f2fs_sb_info *sbi, if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning) segno = 0; else - segno = max(first_zoned_segno(sbi), *newseg); + segno = max(sbi->first_zoned_segno, *newseg); hint = GET_SEC_FROM_SEG(sbi, segno); } #endif @@ -2723,7 +2725,7 @@ static int get_new_segment(struct f2fs_sb_info *sbi, if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) { /* Write only to sequential zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) { - hint = GET_SEC_FROM_SEG(sbi, first_zoned_segno(sbi)); + hint = GET_SEC_FROM_SEG(sbi, sbi->first_zoned_segno); secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); } else secno = find_first_zero_bit(free_i->free_secmap, @@ -2926,7 +2928,8 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) struct f2fs_summary_block *sum_node; struct page *sum_page; - write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); @@ -3237,7 +3240,8 @@ int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1); + err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), + true, ZONED_PIN_SEC_REQUIRED_COUNT); f2fs_up_write(&sbi->gc_lock); gc_required = false; @@ -3581,18 +3585,35 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } } -int f2fs_get_segment_temp(int seg_type) +enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, + enum log_type type) { - if (IS_HOT(seg_type)) - return HOT; - else if (IS_WARM(seg_type)) - return WARM; - return COLD; + 
struct curseg_info *curseg = CURSEG_I(sbi, type); + enum temp_type temp = COLD; + + switch (curseg->seg_type) { + case CURSEG_HOT_NODE: + case CURSEG_HOT_DATA: + temp = HOT; + break; + case CURSEG_WARM_NODE: + case CURSEG_WARM_DATA: + temp = WARM; + break; + case CURSEG_COLD_NODE: + case CURSEG_COLD_DATA: + temp = COLD; + break; + default: + f2fs_bug_on(sbi, 1); + } + + return temp; } static int __get_segment_type(struct f2fs_io_info *fio) { - int type = 0; + enum log_type type = CURSEG_HOT_DATA; switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: @@ -3608,7 +3629,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) f2fs_bug_on(fio->sbi, true); } - fio->temp = f2fs_get_segment_temp(type); + fio->temp = f2fs_get_segment_temp(fio->sbi, type); return type; } @@ -3793,10 +3814,35 @@ void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, } } +static int log_type_to_seg_type(enum log_type type) +{ + int seg_type = CURSEG_COLD_DATA; + + switch (type) { + case CURSEG_HOT_DATA: + case CURSEG_WARM_DATA: + case CURSEG_COLD_DATA: + case CURSEG_HOT_NODE: + case CURSEG_WARM_NODE: + case CURSEG_COLD_NODE: + seg_type = (int)type; + break; + case CURSEG_COLD_DATA_PINNED: + case CURSEG_ALL_DATA_ATGC: + seg_type = CURSEG_COLD_DATA; + break; + default: + break; + } + return seg_type; +} + static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio); - bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); + enum log_type type = __get_segment_type(fio); + int seg_type = log_type_to_seg_type(type); + bool keep_order = (f2fs_lfs_mode(fio->sbi) && + seg_type == CURSEG_COLD_DATA); if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); @@ -3977,8 +4023,8 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } } - f2fs_bug_on(sbi, !IS_DATASEG(type)); curseg = CURSEG_I(sbi, type); + f2fs_bug_on(sbi, !IS_DATASEG(curseg->seg_type)); mutex_lock(&curseg->curseg_mutex); 
down_write(&sit_i->sentry_lock); @@ -4778,12 +4824,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; - if (i < NR_PERSISTENT_LOG) - array[i].seg_type = CURSEG_HOT_DATA + i; - else if (i == CURSEG_COLD_DATA_PINNED) - array[i].seg_type = CURSEG_COLD_DATA; - else if (i == CURSEG_ALL_DATA_ATGC) - array[i].seg_type = CURSEG_COLD_DATA; + array[i].seg_type = log_type_to_seg_type(i); reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); @@ -5207,7 +5248,7 @@ static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, return 0; } -static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) +static int do_fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) { struct curseg_info *cs = CURSEG_I(sbi, type); struct f2fs_dev_info *zbd; @@ -5312,12 +5353,12 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) return 0; } -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; for (i = 0; i < NR_PERSISTENT_LOG; i++) { - ret = fix_curseg_write_pointer(sbi, i); + ret = do_fix_curseg_write_pointer(sbi, i); if (ret) return ret; } @@ -5340,7 +5381,7 @@ static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, return check_zone_write_pointer(args->sbi, args->fdev, zone); } -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +static int check_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; struct check_zone_write_pointer_args args; @@ -5360,6 +5401,20 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) return 0; } +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) +{ + int ret; + + if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb)) + return 0; + + f2fs_notice(sbi, "Checking entire write pointers"); + ret = fix_curseg_write_pointer(sbi); + if (!ret) + ret = check_write_pointer(sbi); + return ret; 
+} + /* * Return the number of usable blocks in a segment. The number of blocks * returned is always equal to the number of blocks in a segment for @@ -5396,12 +5451,7 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( return BLKS_PER_SEG(sbi); } #else -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) -{ - return 0; -} - -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { return 0; } @@ -5430,6 +5480,35 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi) return SEGS_PER_SEC(sbi); } +unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); + unsigned int secno = 0, start = 0; + unsigned int total_valid_blocks = 0; + unsigned long long mtime = 0; + unsigned int i = 0; + + secno = GET_SEC_FROM_SEG(sbi, segno); + start = GET_SEG_FROM_SEC(sbi, secno); + + if (!__is_large_section(sbi)) + return get_seg_entry(sbi, start + i)->mtime; + + for (i = 0; i < usable_segs_per_sec; i++) { + /* for large section, only check the mtime of valid segments */ + struct seg_entry *se = get_seg_entry(sbi, start+i); + + mtime += se->mtime * se->valid_blocks; + total_valid_blocks += se->valid_blocks; + } + + if (total_valid_blocks == 0) + return INVALID_MTIME; + + return div_u64(mtime, total_valid_blocks); +} + /* * Update min, max modified time for cost-benefit GC algorithm */ @@ -5443,13 +5522,9 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { - unsigned int i; unsigned long long mtime = 0; - for (i = 0; i < SEGS_PER_SEC(sbi); i++) - mtime += get_seg_entry(sbi, segno + i)->mtime; - - mtime = div_u64(mtime, SEGS_PER_SEC(sbi)); + mtime = f2fs_get_section_mtime(sbi, segno); if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h 
index 71adb4a43bec..943be4f1d6d2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ #define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ +#define INVALID_MTIME ULLONG_MAX /* no valid blocks in a segment/section */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) @@ -32,10 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); } -#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) -#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) -#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) - #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ @@ -524,8 +522,7 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments + - SM_I(sbi)->additional_reserved_segments; + return SM_I(sbi)->reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) @@ -559,18 +556,21 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) } static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, - unsigned int node_blocks, unsigned int dent_blocks) + unsigned int node_blocks, unsigned int data_blocks, + unsigned int dent_blocks) { - unsigned segno, left_blocks; + unsigned int segno, left_blocks, blocks; int i; - /* check current node sections in the worst case. */ - for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + /* check current data/node sections in the worst case. 
*/ + for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); - if (node_blocks > left_blocks) + + blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks; + if (blocks > left_blocks) return false; } @@ -584,8 +584,9 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, } /* - * calculate needed sections for dirty node/dentry - * and call has_curseg_enough_space + * calculate needed sections for dirty node/dentry and call + * has_curseg_enough_space, please note that, it needs to account + * dirty data as well in lfs mode when checkpoint is disabled. */ static inline void __get_secs_required(struct f2fs_sb_info *sbi, unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) @@ -594,19 +595,30 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi, get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int total_data_blocks = 0; unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int data_secs = 0; unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int data_blocks = 0; + + if (f2fs_lfs_mode(sbi) && + unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA); + data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi); + data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi); + } if (lower_p) - *lower_p = node_secs + dent_secs; + *lower_p = node_secs + dent_secs + data_secs; if (upper_p) *upper_p = node_secs + dent_secs + - (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) + + (data_blocks ? 
1 : 0); if (curseg_p) *curseg_p = has_curseg_enough_space(sbi, - node_blocks, dent_blocks); + node_blocks, data_blocks, dent_blocks); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -637,12 +649,30 @@ static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, return !has_not_enough_free_secs(sbi, freed, needed); } +static inline bool has_enough_free_blks(struct f2fs_sb_info *sbi) +{ + unsigned int total_free_blocks = 0; + unsigned int avail_user_block_count; + + spin_lock(&sbi->stat_lock); + + avail_user_block_count = get_available_block_count(sbi, NULL, true); + total_free_blocks = avail_user_block_count - (unsigned int)valid_user_blocks(sbi); + + spin_unlock(&sbi->stat_lock); + + return total_free_blocks > 0; +} + static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) { if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (likely(has_enough_free_secs(sbi, 0, 0))) return true; + if (!f2fs_lfs_mode(sbi) && + likely(has_enough_free_blks(sbi))) + return true; return false; } @@ -957,13 +987,3 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) dcc->discard_wake = true; wake_up_interruptible_all(&dcc->discard_wait_queue); } - -static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi) -{ - int devi; - - for (devi = 0; devi < sbi->s_ndevs; devi++) - if (bdev_is_zoned(FDEV(devi).bdev)) - return GET_SEGNO(sbi, FDEV(devi).start_blk); - return 0; -} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 87ab5696bd48..fc7d463dee15 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -150,6 +150,8 @@ enum { Opt_mode, Opt_fault_injection, Opt_fault_type, + Opt_lazytime, + Opt_nolazytime, Opt_quota, Opt_noquota, Opt_usrquota, @@ -226,6 +228,8 @@ static match_table_t f2fs_tokens = { {Opt_mode, "mode=%s"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_fault_type, "fault_type=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_quota, "quota"}, 
{Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, @@ -834,6 +838,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: + if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) { + f2fs_err(sbi, "device aliasing requires extent cache"); + return -EINVAL; + } clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: @@ -918,6 +926,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "fault_type options not supported"); break; #endif + case Opt_lazytime: + sb->s_flags |= SB_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~SB_LAZYTIME; + break; #ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: @@ -1158,7 +1172,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; } - strcpy(ext[ext_cnt], name); + ret = strscpy(ext[ext_cnt], name); + if (ret < 0) { + kfree(name); + return ret; + } F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; @@ -1187,7 +1205,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; } - strcpy(noext[noext_cnt], name); + ret = strscpy(noext[noext_cnt], name); + if (ret < 0) { + kfree(name); + return ret; + } F2FS_OPTION(sbi).nocompress_ext_cnt++; kfree(name); break; @@ -1738,6 +1760,18 @@ static int f2fs_freeze(struct super_block *sb) static int f2fs_unfreeze(struct super_block *sb) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + /* + * It will update discard_max_bytes of mounted lvm device to zero + * after creating snapshot on this lvm device, let's drop all + * remaining discards. + * We don't need to disable real-time discard because discard_max_bytes + * will recover after removal of snapshot. 
+ */ + if (test_opt(sbi, DISCARD) && !f2fs_hw_support_discard(sbi)) + f2fs_issue_discard_timeout(sbi); + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; } @@ -2474,6 +2508,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + adjust_unusable_cap_perc(sbi); if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); @@ -2518,7 +2553,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); limit_reserve_root(sbi); - adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; restore_checkpoint: @@ -3322,7 +3356,7 @@ loff_t max_file_blocks(struct inode *inode) * fit within U32_MAX + 1 data units. */ - result = min(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096)); + result = umin(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096)); return result; } @@ -4155,8 +4189,7 @@ static bool system_going_down(void) || system_state == SYSTEM_RESTART; } -void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, - bool irq_context) +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason) { struct super_block *sb = sbi->sb; bool shutdown = reason == STOP_CP_REASON_SHUTDOWN; @@ -4168,10 +4201,12 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, if (!f2fs_hw_is_readonly(sbi)) { save_stop_reason(sbi, reason); - if (irq_context && !shutdown) - schedule_work(&sbi->s_error_work); - else - f2fs_record_stop_reason(sbi); + /* + * always create an asynchronous task to record stop_reason + * in order to avoid potential deadlock when running into + * f2fs_record_stop_reason() synchronously. 
+ */ + schedule_work(&sbi->s_error_work); } /* @@ -4217,6 +4252,16 @@ static void f2fs_record_error_work(struct work_struct *work) f2fs_record_stop_reason(sbi); } +static inline unsigned int get_first_zoned_segno(struct f2fs_sb_info *sbi) +{ + int devi; + + for (devi = 0; devi < sbi->s_ndevs; devi++) + if (bdev_is_zoned(FDEV(devi).bdev)) + return GET_SEGNO(sbi, FDEV(devi).start_blk); + return 0; +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -4617,6 +4662,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); + /* get segno of first zoned block device */ + sbi->first_zoned_segno = get_first_zoned_segno(sbi); + /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); if (__exist_node_summaries(sbi)) @@ -4738,26 +4786,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, - * check zoned block devices' write pointer consistency. + * write pointer consistency of cursegs and other zones are already + * checked and fixed during recovery. However, if recovery fails, + * write pointers are left untouched, and retry-mount should check + * them here. 
*/ - if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sb)) { - int err2; - - f2fs_notice(sbi, "Checking entire write pointers"); - err2 = f2fs_check_write_pointer(sbi); - if (err2) - err = err2; - } + if (skip_recovery) + err = f2fs_check_and_fix_write_pointer(sbi); if (err) goto free_meta; + /* f2fs_recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); + err = f2fs_init_inmem_curseg(sbi); if (err) goto sync_free_meta; - /* f2fs_recover_fsync_data() cleared this already */ - clear_sbi_flag(sbi, SBI_POR_DOING); - if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) @@ -4991,9 +5036,6 @@ static int __init init_f2fs_fs(void) err = f2fs_init_shrinker(); if (err) goto free_sysfs; - err = register_filesystem(&f2fs_fs_type); - if (err) - goto free_shrinker; f2fs_create_root_stats(); err = f2fs_init_post_read_processing(); if (err) @@ -5016,7 +5058,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_casefold_cache(); if (err) goto free_compress_cache; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_casefold_cache; return 0; +free_casefold_cache: + f2fs_destroy_casefold_cache(); free_compress_cache: f2fs_destroy_compress_cache(); free_compress_mempool: @@ -5031,8 +5078,6 @@ static int __init init_f2fs_fs(void) f2fs_destroy_post_read_processing(); free_root_stats: f2fs_destroy_root_stats(); - unregister_filesystem(&f2fs_fs_type); -free_shrinker: f2fs_exit_shrinker(); free_sysfs: f2fs_exit_sysfs(); @@ -5056,6 +5101,7 @@ static int __init init_f2fs_fs(void) static void __exit exit_f2fs_fs(void) { + unregister_filesystem(&f2fs_fs_type); f2fs_destroy_casefold_cache(); f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); @@ -5064,7 +5110,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); - unregister_filesystem(&f2fs_fs_type); f2fs_exit_shrinker(); f2fs_exit_sysfs(); 
f2fs_destroy_garbage_collection_cache(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c56e8c873935..6b99dc49f776 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -501,9 +501,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks - - SEGS_TO_BLKS(sbi, - SM_I(sbi)->additional_reserved_segments))) { + F2FS_OPTION(sbi).root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -789,6 +787,13 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "max_read_extent_count")) { + if (t > UINT_MAX) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + if (!strcmp(a->attr.name, "ipu_policy")) { if (t >= BIT(F2FS_IPU_MAX)) return -EINVAL; @@ -1054,6 +1059,8 @@ F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block); F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold); F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold); F2FS_SBI_GENERAL_RW_ATTR(last_age_weight); +/* read extent cache */ +F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); #ifdef CONFIG_BLK_DEV_ZONED F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); @@ -1244,6 +1251,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(hot_data_age_threshold), ATTR_LIST(warm_data_age_threshold), ATTR_LIST(last_age_weight), + ATTR_LIST(max_read_extent_count), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -1313,6 +1321,7 @@ F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); F2FS_SB_FEATURE_RO_ATTR(readonly, RO); +F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS); static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_encryption), @@ -1329,6 +1338,7 @@ static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_casefold), ATTR_LIST(sb_compression), 
ATTR_LIST(sb_readonly), + ATTR_LIST(sb_device_alias), NULL, }; ATTRIBUTE_GROUPS(f2fs_sb_feat); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index b0b821edfd97..c24f8bc01045 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -24,10 +24,11 @@ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ #define COMPRESS_ADDR ((block_t)-2) /* used as compressed data flag */ -#define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS) -#define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS) +#define F2FS_BLKSIZE_MASK (F2FS_BLKSIZE - 1) +#define F2FS_BYTES_TO_BLK(bytes) ((unsigned long long)(bytes) >> F2FS_BLKSIZE_BITS) +#define F2FS_BLK_TO_BYTES(blk) ((unsigned long long)(blk) << F2FS_BLKSIZE_BITS) #define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) -#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1)) +#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1)) /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 955d440be104..f7aaf8d23e20 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -43,6 +43,7 @@ #define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23) #define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24) #define F2FS_IOC_START_ATOMIC_REPLACE _IO(F2FS_IOCTL_MAGIC, 25) +#define F2FS_IOC_GET_DEV_ALIAS_FILE _IOR(F2FS_IOCTL_MAGIC, 26, __u32) /* * should be same as XFS_IOC_GOINGDOWN.