Changes since last update:

- support direct I/O for all uncompressed files;
 
  - support fsdax for non-tailpacking regular files;
 
  - use iomap infrastructure for all uncompressed cases;
 
  - support fiemap for both (un)compressed files;
 
  - introduce chunk-based files for chunk deduplication.
 
  - some cleanups.
 -----BEGIN PGP SIGNATURE-----
 
 iIcEABYIAC8WIQThPAmQN9sSA0DVxtI5NzHcH7XmBAUCYS03PBEceGlhbmdAa2Vy
 bmVsLm9yZwAKCRA5NzHcH7XmBNdRAQCzumTFT7TRrA8PmcNB8viVrDO07czJRpwz
 Fsh0UdH3IQEAsESa11vmQjbKWATCB3g81SMZeeDqGEb4HYLDhgbS+gw=
 =98HB
 -----END PGP SIGNATURE-----

Merge tag 'erofs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs

Pull erofs updates from Gao Xiang:
 "In this cycle, direct I/O and fsdax support for uncompressed files are
  now added in order to avoid double-caching for loop device and VM
  container use cases. All uncompressed cases are now turned into iomap
  infrastructure, which looks much simpler and cleaner.

  In addition, fiemap support is added for both (un)compressed files by
  using iomap infrastructure as well so end users can easily get file
  distribution. We've also added chunk-based uncompressed files support
  for data deduplication as the next step of VM container use cases.

  Summary:

   - support direct I/O for all uncompressed files

   - support fsdax for non-tailpacking regular files

   - use iomap infrastructure for all uncompressed cases

   - support fiemap for both (un)compressed files

   - introduce chunk-based files for chunk deduplication

   - some cleanups"

* tag 'erofs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
  erofs: fix double free of 'copied'
  erofs: support reading chunk-based uncompressed files
  erofs: introduce chunk-based file on-disk format
  erofs: add fiemap support with iomap
  erofs: add support for the full decompressed length
  erofs: remove the mapping parameter from erofs_try_to_free_cached_page()
  erofs: directly use wrapper erofs_page_is_managed() when shrinking
  erofs: convert all uncompressed cases to iomap
  erofs: dax support for non-tailpacking regular file
  erofs: iomap support for non-tailpacking DIO
This commit is contained in:
Linus Torvalds 2021-09-02 09:12:05 -07:00
commit 412106c203
10 changed files with 525 additions and 225 deletions

View File

@ -84,6 +84,9 @@ cache_strategy=%s Select a strategy for cached decompression from now on:
It still does in-place I/O decompression
for the rest compressed physical clusters.
========== =============================================
dax={always,never} Use direct access (no page cache). See
Documentation/filesystems/dax.rst.
dax A legacy option which is an alias for ``dax=always``.
=================== =========================================================
On-disk details
@ -153,13 +156,14 @@ may not. All metadatas can be now observed in two different spaces (views):
Xattrs, extents, data inline are followed by the corresponding inode with
proper alignment, and they could be optional for different data mappings.
_currently_ total 4 valid data mappings are supported:
_currently_ total 5 data layouts are supported:
== ====================================================================
0 flat file data without data inline (no extent);
1 fixed-sized output data compression (with non-compacted indexes);
2 flat file data with tail packing data inline (no extent);
3 fixed-sized output data compression (with compacted indexes, v5.3+).
3 fixed-sized output data compression (with compacted indexes, v5.3+);
4 chunk-based file (v5.15+).
== ====================================================================
The size of the optional xattrs is indicated by i_xattr_count in inode
@ -210,6 +214,17 @@ Note that apart from the offset of the first filename, nameoff0 also indicates
the total number of directory entries in this block since there is no need to
introduce another on-disk field at all.
Chunk-based file
----------------
In order to support chunk-based data deduplication, a new inode data layout has
been supported since Linux v5.15: files are split into equal-sized data chunks,
with the ``extents`` area of the inode metadata indicating how to get the chunk
data. This area can be either a simple 4-byte block address array or an array
of 8-byte chunk indexes (see struct erofs_inode_chunk_index in erofs_fs.h for
more details).
By the way, chunk-based files are all uncompressed for now.
Data compression
----------------
EROFS implements LZ4 fixed-sized output compression which generates fixed-sized

View File

@ -3,6 +3,7 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
select FS_IOMAP
select LIBCRC32C
help
EROFS (Enhanced Read-Only File System) is a lightweight

View File

@ -2,35 +2,13 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
#include <linux/prefetch.h>
#include <linux/dax.h>
#include <trace/events/erofs.h>
/*
 * Read bio completion handler: propagate the bio status to every page
 * attached to the bio, unlock the pages and drop the bio reference.
 */
static void erofs_readendio(struct bio *bio)
{
	const blk_status_t status = bio->bi_status;
	struct bvec_iter_all iter_all;
	struct bio_vec *bvec;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *const pg = bvec->bv_page;

		/* the page is still locked and must not be up to date yet */
		DBG_BUGON(PageUptodate(pg));

		if (!status)
			SetPageUptodate(pg);
		else
			SetPageError(pg);

		unlock_page(pg);
		/* from here on the page may be reclaimed */
	}
	bio_put(bio);
}
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
{
struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
@ -59,13 +37,6 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
lastblk = nblocks - tailendpacking;
if (offset >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
map->m_plen = 0;
goto out;
}
/* there is no hole in flatmode */
map->m_flags = EROFS_MAP_MAPPED;
@ -100,217 +71,273 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
goto err_out;
}
out:
map->m_llen = map->m_plen;
err_out:
trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
return err;
}
static inline struct bio *erofs_read_raw_page(struct bio *bio,
struct address_space *mapping,
struct page *page,
erofs_off_t *last_block,
unsigned int nblocks,
unsigned int *eblks,
bool ra)
static int erofs_map_blocks(struct inode *inode,
struct erofs_map_blocks *map, int flags)
{
struct inode *const inode = mapping->host;
struct super_block *const sb = inode->i_sb;
erofs_off_t current_block = (erofs_off_t)page->index;
int err;
struct super_block *sb = inode->i_sb;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_inode_chunk_index *idx;
struct page *page;
u64 chunknr;
unsigned int unit;
erofs_off_t pos;
int err = 0;
DBG_BUGON(!nblocks);
if (PageUptodate(page)) {
err = 0;
goto has_updated;
if (map->m_la >= inode->i_size) {
/* leave out-of-bound access unmapped */
map->m_flags = 0;
map->m_plen = 0;
goto out;
}
/* note that for readpage case, bio also equals to NULL */
if (bio &&
(*last_block + 1 != current_block || !*eblks)) {
submit_bio_retry:
submit_bio(bio);
bio = NULL;
}
if (vi->datalayout != EROFS_INODE_CHUNK_BASED)
return erofs_map_blocks_flatmode(inode, map, flags);
if (!bio) {
struct erofs_map_blocks map = {
.m_la = blknr_to_addr(current_block),
};
erofs_blk_t blknr;
unsigned int blkoff;
if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)
unit = sizeof(*idx); /* chunk index */
else
unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */
err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW);
if (err)
goto err_out;
chunknr = map->m_la >> vi->chunkbits;
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
/* zero out the holed page */
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos));
if (IS_ERR(page))
return PTR_ERR(page);
/* imply err = 0, see erofs_map_blocks */
goto has_updated;
map->m_la = chunknr << vi->chunkbits;
map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
roundup(inode->i_size - map->m_la, EROFS_BLKSIZ));
/* handle block map */
if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
__le32 *blkaddr = page_address(page) + erofs_blkoff(pos);
if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
map->m_flags = 0;
} else {
map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr));
map->m_flags = EROFS_MAP_MAPPED;
}
/* for RAW access mode, m_plen must be equal to m_llen */
DBG_BUGON(map.m_plen != map.m_llen);
blknr = erofs_blknr(map.m_pa);
blkoff = erofs_blkoff(map.m_pa);
/* deal with inline page */
if (map.m_flags & EROFS_MAP_META) {
void *vsrc, *vto;
struct page *ipage;
DBG_BUGON(map.m_plen > PAGE_SIZE);
ipage = erofs_get_meta_page(inode->i_sb, blknr);
if (IS_ERR(ipage)) {
err = PTR_ERR(ipage);
goto err_out;
}
vsrc = kmap_atomic(ipage);
vto = kmap_atomic(page);
memcpy(vto, vsrc + blkoff, map.m_plen);
memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
kunmap_atomic(vto);
kunmap_atomic(vsrc);
flush_dcache_page(page);
SetPageUptodate(page);
/* TODO: could we unlock the page earlier? */
unlock_page(ipage);
put_page(ipage);
/* imply err = 0, see erofs_map_blocks */
goto has_updated;
goto out_unlock;
}
/* parse chunk indexes */
idx = page_address(page) + erofs_blkoff(pos);
switch (le32_to_cpu(idx->blkaddr)) {
case EROFS_NULL_ADDR:
map->m_flags = 0;
break;
default:
/* only one device is supported for now */
if (idx->device_id) {
erofs_err(sb, "invalid device id %u @ %llu for nid %llu",
le16_to_cpu(idx->device_id),
chunknr, vi->nid);
err = -EFSCORRUPTED;
goto out_unlock;
}
/* pa must be block-aligned for raw reading */
DBG_BUGON(erofs_blkoff(map.m_pa));
/* max # of continuous pages */
if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
*eblks = bio_max_segs(nblocks);
bio = bio_alloc(GFP_NOIO, *eblks);
bio->bi_end_io = erofs_readendio;
bio_set_dev(bio, sb->s_bdev);
bio->bi_iter.bi_sector = (sector_t)blknr <<
LOG_SECTORS_PER_BLOCK;
bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0);
map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
map->m_flags = EROFS_MAP_MAPPED;
break;
}
err = bio_add_page(bio, page, PAGE_SIZE, 0);
/* out of the extent or bio is full */
if (err < PAGE_SIZE)
goto submit_bio_retry;
--*eblks;
*last_block = current_block;
return bio;
err_out:
/* for sync reading, set page error immediately */
if (!ra) {
SetPageError(page);
ClearPageUptodate(page);
}
has_updated:
out_unlock:
unlock_page(page);
put_page(page);
out:
map->m_llen = map->m_plen;
return err;
}
/* if updated manually, continuous pages has a gap */
if (bio)
submit_bio(bio);
return err ? ERR_PTR(err) : NULL;
/*
 * iomap_begin callback for uncompressed files: describe the extent that
 * covers @offset as a single iomap.
 *
 * The range is looked up with erofs_map_blocks().  Unmapped ranges are
 * reported as IOMAP_HOLE, extents flagged EROFS_MAP_META as IOMAP_INLINE
 * (the meta page is stashed in iomap->private so that erofs_iomap_end()
 * can release it) and everything else as IOMAP_MAPPED at map.m_pa.
 *
 * Returns 0 on success, or a negative errno coming from the block mapping
 * or from reading the meta page.
 */
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
	struct erofs_map_blocks map;
	struct page *ipage;
	int ret;

	map.m_la = offset;
	map.m_llen = length;

	ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
	if (ret < 0)
		return ret;

	/* fields common to every extent type */
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->dax_dev = EROFS_I_SB(inode)->dax_dev;
	iomap->offset = map.m_la;
	iomap->length = map.m_llen;
	iomap->flags = 0;
	iomap->private = NULL;

	if (!(map.m_flags & EROFS_MAP_MAPPED)) {
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
		/*
		 * out-of-bound lookups come back with a zero length;
		 * report the requested length instead
		 */
		if (!iomap->length)
			iomap->length = length;
		return 0;
	}

	if (!(map.m_flags & EROFS_MAP_META)) {
		iomap->type = IOMAP_MAPPED;
		iomap->addr = map.m_pa;
		return 0;
	}

	/* inline data lives in a meta page which stays pinned until ->iomap_end */
	iomap->type = IOMAP_INLINE;
	ipage = erofs_get_meta_page(inode->i_sb, erofs_blknr(map.m_pa));
	if (IS_ERR(ipage))
		return PTR_ERR(ipage);

	iomap->inline_data = page_address(ipage) + erofs_blkoff(map.m_pa);
	iomap->private = ipage;
	return 0;
}
/*
 * iomap_end callback: unlock and release the meta page that
 * erofs_iomap_begin() stored in iomap->private for inline extents.
 * Always returns @written unchanged.
 */
static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned int flags, struct iomap *iomap)
{
	struct page *ipage = iomap->private;

	if (!ipage) {
		/* only inline extents are expected to carry a page */
		DBG_BUGON(iomap->type == IOMAP_INLINE);
		return written;
	}

	DBG_BUGON(iomap->type != IOMAP_INLINE);
	unlock_page(ipage);
	put_page(ipage);
	return written;
}
/*
* iomap callbacks for uncompressed data: erofs_iomap_begin() describes the
* extent covering a file offset and erofs_iomap_end() releases the meta page
* that an inline mapping keeps in iomap->private.
*/
static const struct iomap_ops erofs_iomap_ops = {
.iomap_begin = erofs_iomap_begin,
.iomap_end = erofs_iomap_end,
};
/*
 * ->fiemap implementation shared by regular files and directories.
 *
 * Uncompressed inodes are reported through erofs_iomap_ops.  Compressed
 * inodes use z_erofs_iomap_report_ops when CONFIG_EROFS_FS_ZIP is enabled
 * and fail with -EOPNOTSUPP otherwise.
 */
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		 u64 start, u64 len)
{
	if (!erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout))
		return iomap_fiemap(inode, fieinfo, start, len,
				    &erofs_iomap_ops);

#ifdef CONFIG_EROFS_FS_ZIP
	return iomap_fiemap(inode, fieinfo, start, len,
			    &z_erofs_iomap_report_ops);
#else
	return -EOPNOTSUPP;
#endif
}
/*
* Since we don't have write or truncate flows, no inode locking needs to be
* held at the moment.
*/
static int erofs_raw_access_readpage(struct file *file, struct page *page)
static int erofs_readpage(struct file *file, struct page *page)
{
erofs_off_t last_block;
unsigned int eblks;
struct bio *bio;
trace_erofs_readpage(page, true);
bio = erofs_read_raw_page(NULL, page->mapping,
page, &last_block, 1, &eblks, false);
if (IS_ERR(bio))
return PTR_ERR(bio);
if (bio)
submit_bio(bio);
return 0;
return iomap_readpage(page, &erofs_iomap_ops);
}
static void erofs_raw_access_readahead(struct readahead_control *rac)
static void erofs_readahead(struct readahead_control *rac)
{
erofs_off_t last_block;
unsigned int eblks;
struct bio *bio = NULL;
struct page *page;
trace_erofs_readpages(rac->mapping->host, readahead_index(rac),
readahead_count(rac), true);
while ((page = readahead_page(rac))) {
prefetchw(&page->flags);
bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block,
readahead_count(rac), &eblks, true);
/* all the page errors are ignored when readahead */
if (IS_ERR(bio)) {
pr_err("%s, readahead error at page %lu of nid %llu\n",
__func__, page->index,
EROFS_I(rac->mapping->host)->nid);
bio = NULL;
}
put_page(page);
}
if (bio)
submit_bio(bio);
return iomap_readahead(rac, &erofs_iomap_ops);
}
static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
struct erofs_map_blocks map = {
.m_la = blknr_to_addr(block),
};
return iomap_bmap(mapping, block, &erofs_iomap_ops);
}
if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) {
erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
loff_t align = iocb->ki_pos | iov_iter_count(to) |
iov_iter_alignment(to);
struct block_device *bdev = inode->i_sb->s_bdev;
unsigned int blksize_mask;
if (block >> LOG_SECTORS_PER_BLOCK >= blks)
return 0;
}
if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW))
return erofs_blknr(map.m_pa);
if (bdev)
blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1;
else
blksize_mask = (1 << inode->i_blkbits) - 1;
if (align & blksize_mask)
return -EINVAL;
return 0;
}
/*
 * ->read_iter for uncompressed regular files.
 *
 * DAX inodes are read through dax_iomap_rw().  O_DIRECT requests go to
 * iomap_dio_rw() once erofs_prepare_dio() accepts them; a negative result
 * from erofs_prepare_dio() is returned to the caller and a positive one
 * falls back to buffered I/O.  Everything else is a buffered read.
 */
static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	int err;

	/* read-only filesystem: the (shared) inode lock is not taken here */
	if (!iov_iter_count(to))
		return 0;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(iocb->ki_filp->f_mapping->host))
		return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
	if (!(iocb->ki_flags & IOCB_DIRECT))
		return filemap_read(iocb, to, 0);

	err = erofs_prepare_dio(iocb, to);
	if (err < 0)
		return err;
	if (err)
		return filemap_read(iocb, to, 0);

	return iomap_dio_rw(iocb, to, &erofs_iomap_ops, NULL, 0);
}
/* for uncompressed (aligned) files and raw access for other files */
const struct address_space_operations erofs_raw_access_aops = {
.readpage = erofs_raw_access_readpage,
.readahead = erofs_raw_access_readahead,
.readpage = erofs_readpage,
.readahead = erofs_readahead,
.bmap = erofs_bmap,
.direct_IO = noop_direct_IO,
};
#ifdef CONFIG_FS_DAX
/* DAX fault handler for any entry size: delegate to dax_iomap_fault() */
static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	vm_fault_t ret;

	ret = dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
	return ret;
}
/* PTE-sized DAX fault: same path as huge faults with PE_SIZE_PTE */
static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
{
	vm_fault_t ret = erofs_dax_huge_fault(vmf, PE_SIZE_PTE);

	return ret;
}
/* VM operations installed by erofs_file_mmap() on mappings of DAX inodes */
static const struct vm_operations_struct erofs_dax_vm_ops = {
.fault = erofs_dax_fault,
.huge_fault = erofs_dax_huge_fault,
};
/*
 * ->mmap for uncompressed regular files.
 *
 * Non-DAX inodes use generic_file_readonly_mmap().  For DAX inodes a
 * mapping that is both shared and potentially writable is refused with
 * -EINVAL; otherwise the DAX fault handlers are installed and the VMA is
 * marked VM_HUGEPAGE.
 */
static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	const unsigned long shared_writable = VM_SHARED | VM_MAYWRITE;

	if (!IS_DAX(file_inode(file)))
		return generic_file_readonly_mmap(file, vma);

	if ((vma->vm_flags & shared_writable) == shared_writable)
		return -EINVAL;

	vma->vm_ops = &erofs_dax_vm_ops;
	vma->vm_flags |= VM_HUGEPAGE;
	return 0;
}
#else
#define erofs_file_mmap generic_file_readonly_mmap
#endif
/*
* File operations for regular files whose data is not compressed; reads go
* through erofs_file_read_iter() and mappings through erofs_file_mmap().
*/
const struct file_operations erofs_file_fops = {
.llseek = generic_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
.splice_read = generic_file_splice_read,
};

View File

@ -4,6 +4,7 @@
*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
#ifndef __EROFS_FS_H
#define __EROFS_FS_H
@ -19,10 +20,12 @@
#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001
#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
#define EROFS_ALL_FEATURE_INCOMPAT \
(EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER)
EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
EROFS_FEATURE_INCOMPAT_CHUNKED_FILE)
#define EROFS_SB_EXTSLOT_SIZE 16
@ -64,13 +67,16 @@ struct erofs_super_block {
* inode, [xattrs], last_inline_data, ... | ... | no-holed data
* 3 - inode compression D:
* inode, [xattrs], map_header, extents ... | ...
* 4~7 - reserved
* 4 - inode chunk-based E:
* inode, [xattrs], chunk indexes ... | ...
* 5~7 - reserved
*/
enum {
EROFS_INODE_FLAT_PLAIN = 0, /* flat data, nothing inline */
EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, /* compressed, non-compacted indexes */
EROFS_INODE_FLAT_INLINE = 2, /* flat data with the tail packed inline */
EROFS_INODE_FLAT_COMPRESSION = 3, /* compressed, compacted indexes */
EROFS_INODE_CHUNK_BASED = 4, /* chunk-based file */
EROFS_INODE_DATALAYOUT_MAX /* one past the last defined layout */
};
@ -90,6 +96,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_ALL \
((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)
/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
/* with chunk indexes or just a 4-byte blkaddr array */
#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
#define EROFS_CHUNK_FORMAT_ALL \
(EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
/* on-disk chunk summary kept in the i_u union of chunk-based inodes (4 bytes) */
struct erofs_inode_chunk_info {
__le16 format; /* EROFS_CHUNK_FORMAT_* bits: chunk blkbits, index form */
__le16 reserved;
};
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
@ -107,6 +126,9 @@ struct erofs_inode_compact {
/* for device files, used to indicate old/new device # */
__le32 rdev;
/* for chunk-based files, it contains the summary info */
struct erofs_inode_chunk_info c;
} i_u;
__le32 i_ino; /* only used for 32-bit stat compatibility */
__le16 i_uid;
@ -135,6 +157,9 @@ struct erofs_inode_extended {
/* for device files, used to indicate old/new device # */
__le32 rdev;
/* for chunk-based files, it contains the summary info */
struct erofs_inode_chunk_info c;
} i_u;
/* only used for 32-bit stat compatibility */
@ -204,6 +229,19 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
e->e_name_len + le16_to_cpu(e->e_value_size));
}
/* represent a zeroed chunk (hole) */
#define EROFS_NULL_ADDR -1
/* 4-byte block address array */
#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
/*
* 8-byte inode chunk index, used when EROFS_CHUNK_FORMAT_INDEXES is set in the
* inode's chunk format (otherwise a plain 4-byte block address array is used).
*/
struct erofs_inode_chunk_index {
__le16 advise; /* always 0, don't care for now */
__le16 device_id; /* back-end storage id, always 0 for now */
__le32 blkaddr; /* start block address of this inode chunk; EROFS_NULL_ADDR marks a hole */
};
/* maximum supported size of a physical compression cluster */
#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
@ -338,9 +376,14 @@ static inline void erofs_check_ondisk_layout_definitions(void)
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4);
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4);
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8);
BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8);
BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
/* keep in sync between 2 index structures for better extendibility */
BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
sizeof(struct z_erofs_vle_decompressed_index));
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);

View File

@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
@ -122,8 +123,11 @@ static struct page *erofs_read_inode(struct inode *inode,
/* total blocks for compressed files */
if (erofs_inode_is_data_compressed(vi->datalayout))
nblks = le32_to_cpu(die->i_u.compressed_blocks);
else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
/* fill chunked inode summary info */
vi->chunkformat = le16_to_cpu(die->i_u.c.format);
kfree(copied);
copied = NULL;
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
@ -160,6 +164,8 @@ static struct page *erofs_read_inode(struct inode *inode,
inode->i_size = le32_to_cpu(dic->i_size);
if (erofs_inode_is_data_compressed(vi->datalayout))
nblks = le32_to_cpu(dic->i_u.compressed_blocks);
else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
break;
default:
erofs_err(inode->i_sb,
@ -169,11 +175,26 @@ static struct page *erofs_read_inode(struct inode *inode,
goto err_out;
}
/*
 * Validate the chunk summary info of chunk-based inodes.  Any bit outside
 * EROFS_CHUNK_FORMAT_ALL denotes a format this implementation does not
 * understand, so such inodes are rejected.  Note that an all-zero format
 * (block map, chunk size == block size) is valid and must be accepted.
 */
if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
	if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
		erofs_err(inode->i_sb,
			  "unsupported chunk format %x of nid %llu",
			  vi->chunkformat, vi->nid);
		err = -EOPNOTSUPP;
		goto err_out;
	}
	/* chunk size = block size << blkbits, stored as a shift count */
	vi->chunkbits = LOG_BLOCK_SIZE +
		(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
inode->i_atime.tv_sec = inode->i_ctime.tv_sec;
inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
vi->datalayout == EROFS_INODE_FLAT_PLAIN)
inode->i_flags |= S_DAX;
if (!nblks)
/* measure inode.i_blocks as generic filesystems */
inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
@ -247,7 +268,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
inode->i_fop = &generic_ro_fops;
if (erofs_inode_is_data_compressed(vi->datalayout))
inode->i_fop = &generic_ro_fops;
else
inode->i_fop = &erofs_file_fops;
break;
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
@ -358,6 +382,7 @@ const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
.get_acl = erofs_get_acl,
.fiemap = erofs_fiemap,
};
const struct inode_operations erofs_symlink_iops = {

View File

@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
* Copyright (C) 2021, Alibaba Cloud
*/
#ifndef __EROFS_INTERNAL_H
#define __EROFS_INTERNAL_H
@ -15,6 +16,7 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/iomap.h>
#include "erofs_fs.h"
/* redefine pr_fmt "erofs: " */
@ -83,6 +85,7 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
struct dax_device *dax_dev;
u32 blocks;
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
@ -115,6 +118,8 @@ struct erofs_sb_info {
/* Mount flags set via mount options or defaults */
#define EROFS_MOUNT_XATTR_USER 0x00000010
#define EROFS_MOUNT_POSIX_ACL 0x00000020
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option)
#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option)
@ -257,6 +262,10 @@ struct erofs_inode {
union {
erofs_blk_t raw_blkaddr;
struct {
unsigned short chunkformat;
unsigned char chunkbits;
};
#ifdef CONFIG_EROFS_FS_ZIP
struct {
unsigned short z_advise;
@ -353,8 +362,15 @@ struct erofs_map_blocks {
/* Flags used by erofs_map_blocks_flatmode() */
#define EROFS_GET_BLOCKS_RAW 0x0001
/*
* Used to get the exact decompressed length, e.g. fiemap (consider lookback
* approach instead if possible since it's more metadata lightweight.)
*/
#define EROFS_GET_BLOCKS_FIEMAP 0x0002
/* zmap.c */
extern const struct iomap_ops z_erofs_iomap_report_ops;
#ifdef CONFIG_EROFS_FS_ZIP
int z_erofs_fill_inode(struct inode *inode);
int z_erofs_map_blocks_iter(struct inode *inode,
@ -371,7 +387,10 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
#endif /* !CONFIG_EROFS_FS_ZIP */
/* data.c */
extern const struct file_operations erofs_file_fops;
struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
/* inode.c */
static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
@ -441,8 +460,7 @@ int __init z_erofs_init_zip_subsystem(void);
void z_erofs_exit_zip_subsystem(void);
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
int erofs_try_to_free_cached_page(struct address_space *mapping,
struct page *page);
int erofs_try_to_free_cached_page(struct page *page);
int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int len);

View File

@ -245,4 +245,5 @@ const struct inode_operations erofs_dir_iops = {
.getattr = erofs_getattr,
.listxattr = erofs_listxattr,
.get_acl = erofs_get_acl,
.fiemap = erofs_fiemap,
};

View File

@ -11,6 +11,7 @@
#include <linux/crc32c.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/dax.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@ -355,6 +356,8 @@ enum {
Opt_user_xattr,
Opt_acl,
Opt_cache_strategy,
Opt_dax,
Opt_dax_enum,
Opt_err
};
@ -365,14 +368,47 @@ static const struct constant_table erofs_param_cache_strategy[] = {
{}
};
/*
* Values accepted by the "dax=" mount option, mapped to the mount flag that
* erofs_fc_set_dax_mode() receives as its mode.
*/
static const struct constant_table erofs_dax_param_enums[] = {
{"always", EROFS_MOUNT_DAX_ALWAYS},
{"never", EROFS_MOUNT_DAX_NEVER},
{}
};
/*
* Mount options understood by erofs.  "dax" is listed twice on purpose: once
* as a bare flag (legacy spelling, handled as dax=always) and once as an enum
* taking "always" or "never".
*/
static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_flag_no("user_xattr", Opt_user_xattr),
fsparam_flag_no("acl", Opt_acl),
fsparam_enum("cache_strategy", Opt_cache_strategy,
erofs_param_cache_strategy),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
{}
};
/*
 * Record the requested DAX mode in the mount context.
 *
 * @mode is EROFS_MOUNT_DAX_ALWAYS or EROFS_MOUNT_DAX_NEVER; setting one
 * clears the other.  Returns true when the mode was applied, false for an
 * unknown mode or when the kernel is built without CONFIG_FS_DAX.
 */
static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
{
#ifdef CONFIG_FS_DAX
	struct erofs_fs_context *ctx = fc->fs_private;

	if (mode == EROFS_MOUNT_DAX_ALWAYS) {
		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
		set_opt(ctx, DAX_ALWAYS);
		clear_opt(ctx, DAX_NEVER);
		return true;
	}

	if (mode == EROFS_MOUNT_DAX_NEVER) {
		set_opt(ctx, DAX_NEVER);
		clear_opt(ctx, DAX_ALWAYS);
		return true;
	}

	/* the option parser only hands over the two modes above */
	DBG_BUGON(1);
	return false;
#else
	errorfc(fc, "dax options not supported");
	return false;
#endif
}
static int erofs_fc_parse_param(struct fs_context *fc,
struct fs_parameter *param)
{
@ -412,6 +448,14 @@ static int erofs_fc_parse_param(struct fs_context *fc,
errorfc(fc, "compression not supported, cache_strategy ignored");
#endif
break;
case Opt_dax:
if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS))
return -EINVAL;
break;
case Opt_dax_enum:
if (!erofs_fc_set_dax_mode(fc, result.uint_32))
return -EINVAL;
break;
default:
return -ENOPARAM;
}
@ -430,7 +474,7 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
DBG_BUGON(mapping->a_ops != &managed_cache_aops);
if (PagePrivate(page))
ret = erofs_try_to_free_cached_page(mapping, page);
ret = erofs_try_to_free_cached_page(page);
return ret;
}
@ -496,10 +540,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
sb->s_fs_info = sbi;
sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
err = erofs_read_superblock(sb);
if (err)
return err;
if (test_opt(ctx, DAX_ALWAYS) &&
!bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) {
errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
clear_opt(ctx, DAX_ALWAYS);
}
sb->s_flags |= SB_RDONLY | SB_NOATIME;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_time_gran = 1;
@ -609,6 +659,7 @@ static void erofs_kill_sb(struct super_block *sb)
sbi = EROFS_SB(sb);
if (!sbi)
return;
fs_put_dax(sbi->dax_dev);
kfree(sbi);
sb->s_fs_info = NULL;
}
@ -711,8 +762,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int erofs_show_options(struct seq_file *seq, struct dentry *root)
{
struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx;
struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
struct erofs_fs_context *ctx = &sbi->ctx;
#ifdef CONFIG_EROFS_FS_XATTR
if (test_opt(ctx, XATTR_USER))
@ -734,6 +785,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
seq_puts(seq, ",cache_strategy=readaround");
#endif
if (test_opt(ctx, DAX_ALWAYS))
seq_puts(seq, ",dax=always");
if (test_opt(ctx, DAX_NEVER))
seq_puts(seq, ",dax=never");
return 0;
}

View File

@ -309,7 +309,6 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
struct address_space *const mapping = MNGD_MAPPING(sbi);
int i;
/*
@ -326,7 +325,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
if (!trylock_page(page))
return -EBUSY;
if (page->mapping != mapping)
if (!erofs_page_is_managed(sbi, page))
continue;
/* barrier is implied in the following 'unlock_page' */
@ -337,8 +336,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
return 0;
}
int erofs_try_to_free_cached_page(struct address_space *mapping,
struct page *page)
int erofs_try_to_free_cached_page(struct page *page)
{
struct z_erofs_pcluster *const pcl = (void *)page_private(page);
int ret = 0; /* 0 - busy */

View File

@ -212,9 +212,34 @@ static unsigned int decode_compactedbits(unsigned int lobits,
return lo;
}
/*
 * Compute the lookahead distance for a compacted index pack.
 *
 * Starting at entry @i of the pack @in (which holds @vcnt entries of
 * @encodebits bits each), count consecutive NONHEAD entries.  The count is
 * returned as soon as a non-NONHEAD entry is met.  If every entry up to the
 * end of the pack is NONHEAD and the last one does not carry
 * Z_EROFS_VLE_DI_D0_CBLKCNT, its lo value minus one is added to the count.
 */
static int get_compacted_la_distance(unsigned int lclusterbits,
				     unsigned int encodebits,
				     unsigned int vcnt, u8 *in, int i)
{
	const unsigned int lomask = (1 << lclusterbits) - 1;
	unsigned int distance = 0;
	unsigned int lo;
	u8 type;

	DBG_BUGON(i >= vcnt);

	/* the entry at @i is always decoded, like in a do/while loop */
	for (;;) {
		lo = decode_compactedbits(lclusterbits, lomask,
					  in, encodebits * i, &type);

		if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
			return distance;

		++distance;
		if (++i >= vcnt)
			break;
	}

	/* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */
	if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT))
		distance += lo - 1;
	return distance;
}
static int unpack_compacted_index(struct z_erofs_maprecorder *m,
unsigned int amortizedshift,
unsigned int eofs)
unsigned int eofs, bool lookahead)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
@ -243,6 +268,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
m->type = type;
if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << lclusterbits;
/* figure out lookahead_distance: delta[1] if needed */
if (lookahead)
m->delta[1] = get_compacted_la_distance(lclusterbits,
encodebits, vcnt, in, i);
if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
if (!big_pcluster) {
DBG_BUGON(1);
@ -313,7 +343,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
}
static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned long lcn)
unsigned long lcn, bool lookahead)
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
@ -364,11 +394,12 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
err = z_erofs_reload_indexes(m, erofs_blknr(pos));
if (err)
return err;
return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos),
lookahead);
}
static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn)
unsigned int lcn, bool lookahead)
{
const unsigned int datamode = EROFS_I(m->inode)->datalayout;
@ -376,7 +407,7 @@ static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
return legacy_load_cluster_from_disk(m, lcn);
if (datamode == EROFS_INODE_FLAT_COMPRESSION)
return compacted_load_cluster_from_disk(m, lcn);
return compacted_load_cluster_from_disk(m, lcn, lookahead);
return -EINVAL;
}
@ -399,7 +430,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
/* load extent head logical cluster if needed */
lcn -= lookback_distance;
err = z_erofs_load_cluster_from_disk(m, lcn);
err = z_erofs_load_cluster_from_disk(m, lcn, false);
if (err)
return err;
@ -450,7 +481,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
if (m->compressedlcs)
goto out;
err = z_erofs_load_cluster_from_disk(m, lcn);
err = z_erofs_load_cluster_from_disk(m, lcn, false);
if (err)
return err;
@ -498,6 +529,48 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return -EFSCORRUPTED;
}
/*
 * Work out the full logical (decompressed) length of the extent that starts
 * at m->map->m_la and store it in m->map->m_llen.
 *
 * Starting from m->lcn, lclusters are loaded with lookahead enabled and the
 * walk advances by m->delta[1] each round until one of the following holds:
 *   - the lcluster start is at or beyond i_size: the extent runs to EOF;
 *   - a PLAIN/HEAD lcluster other than the extent's own head is found;
 *   - the loaded lcluster reports a zero m->delta[1].
 *
 * Returns 0 on success, the error from z_erofs_load_cluster_from_disk(), or
 * -EOPNOTSUPP if an lcluster of unknown type is encountered.
 */
static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
{
	struct inode *inode = m->inode;
	struct erofs_inode *vi = EROFS_I(inode);
	struct erofs_map_blocks *map = m->map;
	unsigned int lclusterbits = vi->z_logical_clusterbits;
	u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
	int err;

	for (;;) {
		/* handle the last EOF pcluster (no next HEAD lcluster) */
		if ((lcn << lclusterbits) >= inode->i_size) {
			map->m_llen = inode->i_size - map->m_la;
			return 0;
		}

		err = z_erofs_load_cluster_from_disk(m, lcn, true);
		if (err)
			return err;

		if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
		    m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) {
			/* go on until the next HEAD lcluster */
			if (lcn != headlcn)
				break;
			/* still on the extent's own head: step past it */
			m->delta[1] = 1;
		} else if (m->type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
			erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
				  m->type, lcn, vi->nid);
			DBG_BUGON(1);
			return -EOPNOTSUPP;
		} else {
			DBG_BUGON(!m->delta[1] &&
				  m->clusterofs != 1 << lclusterbits);
		}

		/* a zero distance ends the walk at the current lcluster */
		if (!m->delta[1])
			break;
		lcn += m->delta[1];
	}

	map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
	return 0;
}
int z_erofs_map_blocks_iter(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
@ -531,7 +604,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
err = z_erofs_load_cluster_from_disk(&m, initial_lcn);
err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false);
if (err)
goto unmap_out;
@ -581,6 +654,12 @@ int z_erofs_map_blocks_iter(struct inode *inode,
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto out;
if (flags & EROFS_GET_BLOCKS_FIEMAP) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
}
unmap_out:
if (m.kaddr)
kunmap_atomic(m.kaddr);
@ -596,3 +675,41 @@ int z_erofs_map_blocks_iter(struct inode *inode,
DBG_BUGON(err < 0 && err != -ENOMEM);
return err;
}
/*
 * ->iomap_begin callback used for extent reporting (FIEMAP) on compressed
 * inodes.
 *
 * Maps the extent containing @offset via z_erofs_map_blocks_iter() with
 * EROFS_GET_BLOCKS_FIEMAP and describes it in @iomap: IOMAP_MAPPED with the
 * physical address when EROFS_MAP_MAPPED is set, otherwise IOMAP_HOLE.
 *
 * @inode:  inode being queried
 * @offset: start of the requested byte range
 * @length: length of the requested byte range
 * @flags:  unused here
 * @iomap:  mapping to fill in
 * @srcmap: unused here
 *
 * Returns 0 on success or the negative error from z_erofs_map_blocks_iter().
 */
static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
				      loff_t length, unsigned int flags,
				      struct iomap *iomap, struct iomap *srcmap)
{
	int ret;
	struct erofs_map_blocks map = { .m_la = offset };

	ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
	/* drop the page reference the mapper may have left behind */
	if (map.mpage)
		put_page(map.mpage);
	if (ret < 0)
		return ret;

	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = map.m_la;
	iomap->length = map.m_llen;
	if (map.m_flags & EROFS_MAP_MAPPED) {
		iomap->type = IOMAP_MAPPED;
		iomap->addr = map.m_pa;
	} else {
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
		/*
		 * There is no strict rule on how to describe extents for
		 * post EOF, yet we need to do it as below.  Otherwise, iomap
		 * itself will get into an endless loop on post EOF.
		 *
		 * The reported hole starts at map.m_la and must reach the end
		 * of the request (offset + length), so the distance between
		 * the extent start and the requested offset is added to
		 * length (map.m_la is expected to be <= offset here).
		 */
		if (iomap->offset >= inode->i_size)
			iomap->length = length + offset - map.m_la;
	}
	iomap->flags = 0;
	return 0;
}
/*
 * iomap operations for extent reporting on compressed inodes; only
 * ->iomap_begin is set, implemented by z_erofs_iomap_begin_report().
 */
const struct iomap_ops z_erofs_iomap_report_ops = {
.iomap_begin = z_erofs_iomap_begin_report,
};