overlayfs update for 5.13

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQSQHSd0lITzzeNWNm3h3BK/laaZPAUCYIwTsgAKCRDh3BK/laaZ
 PDktAP41eScbCiFzXDRjXw9S7Wfd8HEct0y1p+9BUh8m3VdHfwEA0pDlJWNaJdYW
 nFixPJ5GsAfxo+1ags0vn06CUS/K4gA=
 =QlbJ
 -----END PGP SIGNATURE-----

Merge tag 'ovl-update-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs

Pull overlayfs update from Miklos Szeredi:

 - Fix a regression introduced in 5.2 that resulted in valid overlayfs
   mounts being rejected with ELOOP (Too many levels of symbolic links)

 - Fix bugs found by various tools

 - Miscellaneous improvements and cleanups

* tag 'ovl-update-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs:
  ovl: add debug print to ovl_do_getxattr()
  ovl: invalidate readdir cache on changes to dir with origin
  ovl: allow upperdir inside lowerdir
  ovl: show "userxattr" in the mount data
  ovl: trivial typo fixes in the file inode.c
  ovl: fix misspellings using codespell tool
  ovl: do not copy attr several times
  ovl: remove ovl_map_dev_ino() return value
  ovl: fix error for ovl_fill_super()
  ovl: fix missing revert_creds() on error path
  ovl: fix leaked dentry
  ovl: restrict lower null uuid for "xino=auto"
  ovl: check that upperdir path is not on a read-only mount
  ovl: plumb through flush method
This commit is contained in:
Linus Torvalds 2021-04-30 15:17:08 -07:00
commit d652502ef4
9 changed files with 136 additions and 81 deletions

View File

@ -40,17 +40,17 @@ On 64bit systems, even if all overlay layers are not on the same
underlying filesystem, the same compliant behavior could be achieved
with the "xino" feature. The "xino" feature composes a unique object
identifier from the real object st_ino and an underlying fsid index.
If all underlying filesystems support NFS file handles and export file
handles with 32bit inode number encoding (e.g. ext4), overlay filesystem
will use the high inode number bits for fsid. Even when the underlying
filesystem uses 64bit inode numbers, users can still enable the "xino"
feature with the "-o xino=on" overlay mount option. That is useful for the
case of underlying filesystems like xfs and tmpfs, which use 64bit inode
numbers, but are very unlikely to use the high inode number bits. In case
The "xino" feature uses the high inode number bits for fsid, because the
underlying filesystems rarely use the high inode number bits. In case
the underlying inode number does overflow into the high xino bits, overlay
filesystem will fall back to the non xino behavior for that inode.
The "xino" feature can be enabled with the "-o xino=on" overlay mount option.
If all underlying filesystems support NFS file handles, the value of st_ino
for overlay filesystem objects is not only unique, but also persistent over
the lifetime of the filesystem. The "-o xino=auto" overlay mount option
enables the "xino" feature only if the persistent st_ino requirement is met.
The following table summarizes what can be expected in different overlay
configurations.
@ -66,14 +66,13 @@ Inode properties
| All layers | Y | Y | Y | Y | Y | Y | Y | Y |
| on same fs | | | | | | | | |
+--------------+-----+------+-----+------+--------+--------+--------+-------+
| Layers not | N | Y | Y | N | N | Y | N | Y |
| Layers not | N | N | Y | N | N | Y | N | Y |
| on same fs, | | | | | | | | |
| xino=off | | | | | | | | |
+--------------+-----+------+-----+------+--------+--------+--------+-------+
| xino=on/auto | Y | Y | Y | Y | Y | Y | Y | Y |
| | | | | | | | | |
+--------------+-----+------+-----+------+--------+--------+--------+-------+
| xino=on/auto,| N | Y | Y | N | N | Y | N | Y |
| xino=on/auto,| N | N | Y | N | N | Y | N | Y |
| ino overflow | | | | | | | | |
+--------------+-----+------+-----+------+--------+--------+--------+-------+
@ -81,7 +80,6 @@ Inode properties
/proc files, such as /proc/locks and /proc/self/fdinfo/<fd> of an inotify
file descriptor.
Upper and Lower
---------------
@ -461,7 +459,7 @@ enough free bits in the inode number, then overlayfs will not be able to
guarantee that the values of st_ino and st_dev returned by stat(2) and the
value of d_ino returned by readdir(3) will act like on a normal filesystem.
E.g. the value of st_dev may be different for two objects in the same
overlay filesystem and the value of st_ino for directory objects may not be
overlay filesystem and the value of st_ino for filesystem objects may not be
persistent and could change even while the overlay filesystem is mounted, as
summarized in the `Inode properties`_ table above.
@ -476,7 +474,7 @@ a crash or deadlock.
Offline changes, when the overlay is not mounted, are allowed to the
upper tree. Offline changes to the lower tree are only allowed if the
"metadata only copy up", "inode index", and "redirect_dir" features
"metadata only copy up", "inode index", "xino" and "redirect_dir" features
have not been used. If the lower tree is modified and any of these
features has been used, the behavior of the overlay is undefined,
though it will not result in a crash or deadlock.

View File

@ -932,7 +932,7 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
static int ovl_copy_up_flags(struct dentry *dentry, int flags)
{
int err = 0;
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
const struct cred *old_cred;
bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED);
/*
@ -943,6 +943,7 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
if (WARN_ON(disconnected && d_is_dir(dentry)))
return -EIO;
old_cred = ovl_override_creds(dentry->d_sb);
while (!err) {
struct dentry *next;
struct dentry *parent = NULL;

View File

@ -571,6 +571,26 @@ static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in,
remap_flags, op);
}
static int ovl_flush(struct file *file, fl_owner_t id)
{
struct fd real;
const struct cred *old_cred;
int err;
err = ovl_real_fdget(file, &real);
if (err)
return err;
if (real.file->f_op->flush) {
old_cred = ovl_override_creds(file_inode(file)->i_sb);
err = real.file->f_op->flush(real.file, id);
revert_creds(old_cred);
}
fdput(real);
return err;
}
const struct file_operations ovl_file_operations = {
.open = ovl_open,
.release = ovl_release,
@ -581,6 +601,7 @@ const struct file_operations ovl_file_operations = {
.mmap = ovl_mmap,
.fallocate = ovl_fallocate,
.fadvise = ovl_fadvise,
.flush = ovl_flush,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,

View File

@ -97,7 +97,7 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
bool samefs = ovl_same_fs(dentry->d_sb);
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
@ -110,21 +110,21 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
* which is friendly to du -x.
*/
stat->dev = dentry->d_sb->s_dev;
return 0;
return;
} else if (xinobits) {
/*
* All inode numbers of underlying fs should not be using the
* high xinobits, so we use high xinobits to partition the
* overlay st_ino address space. The high bits holds the fsid
* (upper fsid is 0). The lowest xinobit is reserved for mapping
* the non-peresistent inode numbers range in case of overflow.
* the non-persistent inode numbers range in case of overflow.
* This way all overlay inode numbers are unique and use the
* overlay st_dev.
*/
if (likely(!(stat->ino >> xinoshift))) {
stat->ino |= ((u64)fsid) << (xinoshift + 1);
stat->dev = dentry->d_sb->s_dev;
return 0;
return;
} else if (ovl_xino_warn(dentry->d_sb)) {
pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
dentry, stat->ino, xinobits);
@ -153,8 +153,6 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
*/
stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev;
}
return 0;
}
int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
@ -253,9 +251,7 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path,
}
}
err = ovl_map_dev_ino(dentry, stat, fsid);
if (err)
goto out;
ovl_map_dev_ino(dentry, stat, fsid);
/*
* It's probably not worth it to count subdirs to get the
@ -410,7 +406,7 @@ static bool ovl_can_list(struct super_block *sb, const char *s)
if (ovl_is_private_xattr(sb, s))
return false;
/* List all non-trusted xatts */
/* List all non-trusted xattrs */
if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
return true;
@ -615,7 +611,7 @@ static const struct address_space_operations ovl_aops = {
* stackable i_mutex locks according to stack level of the super
* block instance. An overlayfs instance can never be in stack
* depth 0 (there is always a real fs below it). An overlayfs
* inode lock will use the lockdep annotaion ovl_i_mutex_key[depth].
* inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
*
* For example, here is a snip from /proc/lockdep_chains after
* dir_iterate of nested overlayfs:

View File

@ -919,6 +919,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
continue;
if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) {
dput(this);
err = -EPERM;
pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry);
goto out_put;

View File

@ -186,7 +186,12 @@ static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry,
size_t size)
{
const char *name = ovl_xattr(ofs, ox);
return vfs_getxattr(&init_user_ns, dentry, name, value, size);
int err = vfs_getxattr(&init_user_ns, dentry, name, value, size);
int len = (value && err > 0) ? err : 0;
pr_debug("getxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n",
dentry, name, min(len, 48), value, size, err);
return err;
}
static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
@ -319,9 +324,6 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
enum ovl_xattr ox, const void *value, size_t size,
int xerr);
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
void ovl_set_flag(unsigned long flag, struct inode *inode);
void ovl_clear_flag(unsigned long flag, struct inode *inode);
bool ovl_test_flag(unsigned long flag, struct inode *inode);
bool ovl_inuse_trylock(struct dentry *dentry);
void ovl_inuse_unlock(struct dentry *dentry);
bool ovl_is_inuse(struct dentry *dentry);
@ -335,6 +337,21 @@ char *ovl_get_redirect_xattr(struct ovl_fs *ofs, struct dentry *dentry,
int padding);
int ovl_sync_status(struct ovl_fs *ofs);
static inline void ovl_set_flag(unsigned long flag, struct inode *inode)
{
set_bit(flag, &OVL_I(inode)->flags);
}
static inline void ovl_clear_flag(unsigned long flag, struct inode *inode)
{
clear_bit(flag, &OVL_I(inode)->flags);
}
static inline bool ovl_test_flag(unsigned long flag, struct inode *inode)
{
return test_bit(flag, &OVL_I(inode)->flags);
}
static inline bool ovl_is_impuredir(struct super_block *sb,
struct dentry *dentry)
{
@ -439,6 +456,18 @@ int ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
/*
* Can we iterate real dir directly?
*
* Non-merge dir may contain whiteouts from a time it was a merge upper, before
* lower dir was removed under it and possibly before it was rotated from upper
* to lower layer.
*/
static inline bool ovl_dir_is_real(struct dentry *dir)
{
return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir));
}
/* inode.c */
int ovl_set_nlink_upper(struct dentry *dentry);
int ovl_set_nlink_lower(struct dentry *dentry);

View File

@ -319,18 +319,6 @@ static inline int ovl_dir_read(struct path *realpath,
return err;
}
/*
* Can we iterate real dir directly?
*
* Non-merge dir may contain whiteouts from a time it was a merge upper, before
* lower dir was removed under it and possibly before it was rotated from upper
* to lower layer.
*/
static bool ovl_dir_is_real(struct dentry *dir)
{
return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir));
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;

View File

@ -380,6 +380,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
ofs->config.metacopy ? "on" : "off");
if (ofs->config.ovl_volatile)
seq_puts(m, ",volatile");
if (ofs->config.userxattr)
seq_puts(m, ",userxattr");
return 0;
}
@ -945,6 +947,16 @@ static int ovl_lower_dir(const char *name, struct path *path,
pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
name);
}
/*
* Decoding origin file handle is required for persistent st_ino.
* Without persistent st_ino, xino=auto falls back to xino=off.
*/
if (ofs->config.xino == OVL_XINO_AUTO &&
ofs->config.upperdir && !fh_type) {
ofs->config.xino = OVL_XINO_OFF;
pr_warn("fs on '%s' does not support file handles, falling back to xino=off.\n",
name);
}
/* Check if lower fs has 32bit inode numbers */
if (fh_type != FILEID_INO32_GEN)
@ -1042,9 +1054,6 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
}
err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags);
if (!err)
ovl_copyattr(ovl_inode_real(inode), inode);
return err;
out_acl_release:
@ -1185,8 +1194,8 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
if (err)
goto out;
/* Upper fs should not be r/o */
if (sb_rdonly(upperpath->mnt->mnt_sb)) {
/* Upperdir path should not be r/o */
if (__mnt_is_readonly(upperpath->mnt)) {
pr_err("upper fs is r/o, try multi-lower layers mount\n");
err = -EINVAL;
goto out;
@ -1401,9 +1410,19 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
err = ovl_do_setxattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE, "0", 1);
if (err) {
ofs->noxattr = true;
ofs->config.index = false;
ofs->config.metacopy = false;
pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n");
if (ofs->config.index || ofs->config.metacopy) {
ofs->config.index = false;
ofs->config.metacopy = false;
pr_warn("upper fs does not support xattr, falling back to index=off,metacopy=off.\n");
}
/*
* xattr support is required for persistent st_ino.
* Without persistent st_ino, xino=auto falls back to xino=off.
*/
if (ofs->config.xino == OVL_XINO_AUTO) {
ofs->config.xino = OVL_XINO_OFF;
pr_warn("upper fs does not support xattr, falling back to xino=off.\n");
}
err = 0;
} else {
ovl_do_removexattr(ofs, ofs->workdir, OVL_XATTR_OPAQUE);
@ -1580,7 +1599,8 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
* user opted-in to one of the new features that require following the
* lower inode of non-dir upper.
*/
if (!ofs->config.index && !ofs->config.metacopy && !ofs->config.xino &&
if (!ofs->config.index && !ofs->config.metacopy &&
ofs->config.xino != OVL_XINO_ON &&
uuid_is_null(uuid))
return false;
@ -1609,6 +1629,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
dev_t dev;
int err;
bool bad_uuid = false;
bool warn = false;
for (i = 0; i < ofs->numfs; i++) {
if (ofs->fs[i].sb == sb)
@ -1617,13 +1638,20 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
bad_uuid = true;
if (ofs->config.xino == OVL_XINO_AUTO) {
ofs->config.xino = OVL_XINO_OFF;
warn = true;
}
if (ofs->config.index || ofs->config.nfs_export) {
ofs->config.index = false;
ofs->config.nfs_export = false;
pr_warn("%s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
warn = true;
}
if (warn) {
pr_warn("%s uuid detected in lower fs '%pd2', falling back to xino=%s,index=off,nfs_export=off.\n",
uuid_is_null(&sb->s_uuid) ? "null" :
"conflicting",
path->dentry);
path->dentry, ovl_xino_str[ofs->config.xino]);
}
}
@ -1826,7 +1854,8 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
* - upper/work dir of any overlayfs instance
*/
static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
struct dentry *dentry, const char *name)
struct dentry *dentry, const char *name,
bool is_lower)
{
struct dentry *next = dentry, *parent;
int err = 0;
@ -1838,7 +1867,7 @@ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs,
/* Walk back ancestors to root (inclusive) looking for traps */
while (!err && parent != next) {
if (ovl_lookup_trap_inode(sb, parent)) {
if (is_lower && ovl_lookup_trap_inode(sb, parent)) {
err = -ELOOP;
pr_err("overlapping %s path\n", name);
} else if (ovl_is_inuse(parent)) {
@ -1864,7 +1893,7 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
if (ovl_upper_mnt(ofs)) {
err = ovl_check_layer(sb, ofs, ovl_upper_mnt(ofs)->mnt_root,
"upperdir");
"upperdir", false);
if (err)
return err;
@ -1875,7 +1904,8 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
* workbasedir. In that case, we already have their traps in
* inode cache and we will catch that case on lookup.
*/
err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir");
err = ovl_check_layer(sb, ofs, ofs->workbasedir, "workdir",
false);
if (err)
return err;
}
@ -1883,7 +1913,7 @@ static int ovl_check_overlapping_layers(struct super_block *sb,
for (i = 1; i < ofs->numlayer; i++) {
err = ovl_check_layer(sb, ofs,
ofs->layers[i].mnt->mnt_root,
"lowerdir");
"lowerdir", true);
if (err)
return err;
}
@ -1952,6 +1982,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!ofs)
goto out;
err = -ENOMEM;
ofs->creator_cred = cred = prepare_creds();
if (!cred)
goto out_err;
@ -1980,6 +2011,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!splitlower)
goto out_err;
err = -EINVAL;
numlower = ovl_split_lowerdirs(splitlower);
if (numlower > OVL_MAX_STACK) {
pr_err("too many lower directories, limit is %d\n",
@ -1987,6 +2019,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
goto out_err;
}
err = -ENOMEM;
layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL);
if (!layers)
goto out_err;
@ -2013,6 +2046,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (ofs->config.upperdir) {
struct super_block *upper_sb;
err = -EINVAL;
if (!ofs->config.workdir) {
pr_err("missing 'workdir'\n");
goto out_err;

View File

@ -214,7 +214,7 @@ const struct ovl_layer *ovl_layer_lower(struct dentry *dentry)
/*
* ovl_dentry_lower() could return either a data dentry or metacopy dentry
* dependig on what is stored in lowerstack[0]. At times we need to find
* depending on what is stored in lowerstack[0]. At times we need to find
* lower dentry which has data (and not metacopy dentry). This helper
* returns the lower data dentry.
*/
@ -422,18 +422,20 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
}
}
static void ovl_dentry_version_inc(struct dentry *dentry, bool impurity)
static void ovl_dir_version_inc(struct dentry *dentry, bool impurity)
{
struct inode *inode = d_inode(dentry);
WARN_ON(!inode_is_locked(inode));
WARN_ON(!d_is_dir(dentry));
/*
* Version is used by readdir code to keep cache consistent. For merge
* dirs all changes need to be noted. For non-merge dirs, cache only
* contains impure (ones which have been copied up and have origins)
* entries, so only need to note changes to impure entries.
* Version is used by readdir code to keep cache consistent.
* For merge dirs (or dirs with origin) all changes need to be noted.
* For non-merge dirs, cache contains only impure entries (i.e. ones
* which have been copied up and have origins), so only need to note
* changes to impure entries.
*/
if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity)
if (!ovl_dir_is_real(dentry) || impurity)
OVL_I(inode)->version++;
}
@ -442,7 +444,7 @@ void ovl_dir_modified(struct dentry *dentry, bool impurity)
/* Copy mtime/ctime */
ovl_copyattr(d_inode(ovl_dentry_upper(dentry)), d_inode(dentry));
ovl_dentry_version_inc(dentry, impurity);
ovl_dir_version_inc(dentry, impurity);
}
u64 ovl_dentry_version_get(struct dentry *dentry)
@ -638,21 +640,6 @@ int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry)
return err;
}
void ovl_set_flag(unsigned long flag, struct inode *inode)
{
set_bit(flag, &OVL_I(inode)->flags);
}
void ovl_clear_flag(unsigned long flag, struct inode *inode)
{
clear_bit(flag, &OVL_I(inode)->flags);
}
bool ovl_test_flag(unsigned long flag, struct inode *inode)
{
return test_bit(flag, &OVL_I(inode)->flags);
}
/**
* Caller must hold a reference to inode to prevent it from being freed while
* it is marked inuse.