mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-12 08:48:48 +00:00
orangefs: kernel side caching and executable bugfix
This allows OrangeFS to utilize the dcache and adds an in kernel attribute cache. We previously used the user side client for this purpose. We see a modest performance increase on small file operations. For example, without the cache, compiling coreutils takes about 17 minutes. With the patch and a 50 millisecond timeout for dcache_timeout_msecs and getattr_timeout_msecs (the default), compiling coreutils takes about 6 minutes 20 seconds. On the same hardware, compiling coreutils on an xfs filesystem takes 90 seconds. We see similar improvements with mdtest and a test involving writing, reading, and deleting a large number of small files. Interested parties can review more data at the following URL. https://docs.google.com/spreadsheets/d/1v4aUeppKexIbRMz_Yn9k4eaM3uy2KCaPoe_93YKWOtA/pubhtml The eventual goal of this is to allow getdents to turn into a readdirplus to the OrangeFS server. The cache will be filled then, which should provide a performance benefit to the common case of readdir followed by getattr on each entry (i.e. ls -l). This also fixes a bug. When orangefs_inode_permission was added, it did not collect i_size from the OrangeFS server, since this presses an unnecessary load on the OrangeFS server. However, it left a case where i_size is never initialized. Then running an executable could fail. With this patch, size is always collected to be inserted into the cache. Thus the bug disappears. If this patch is not accepted during this merge window, we will send a one-line band-aid for this bug instead. 
-----BEGIN PGP SIGNATURE----- Version: GnuPG v2 iQIxBAABCAAbBQJXoPhPFBxtYXJ0aW5Ab21uaWJvbmQuY29tAAoJEPVzxHxs4+kh wCsQALUKnyoJzhHAmEoxYZGUPchgBS2yyWQJGP3ViqE8GbVubVG2NsLbluO1u5en /pdOPDXeij7pPGzdWk6wt0tXvM3oGJ3UPRi9ofEtU3XHnb4srX6XHBeG3ZHHZH0A 91NPnMsmlBQvivBbVbjYrgXMKXz/UCQot7Y5iP7o9Gmick5tQqhRB21GcSCMeD7k ycrl61EA+GYDZOlzVspF2LJ52MhIXuT1T9ev66dLQWv8p6pMmpA4kda3Dwvqn/cE GGTeElq2PBGdhGapK4axGfRAW55997j9k6gcxLvFdA99ayAQ3+0hzXw4rNzcdabA ESUOe4riaYEaGEd686Mtd2w9hxvr1bOqkyRCKNnko90JJnqfGsgLfetpasG8CgUo n8VGxjimuCamBDf1+0ZzUs0Pj8q+U1QNQtHJi9QR/sNnNds/52k9OXV2r4MG+suU MAie5eD0Py6GzP9pOrAmuFbBkgd7Ag3EbiTjR1lKRpBR626inL/jM60XFfaF4P5g YOXC+VtJuVR88emIxqJ9ebdEy9+2yfkyinrLH9xZNctoz7KIoMhsmWb2bONKJDnx ngoqVKyH5opw6dKRkbTCM1A2mq8NntDvU6yeyHYJ2NXPXgARf9rSUIJ0RvR3oxdh Fqt5QyYHYDPZBuQn9XUV7t+VhAOFCbAPUDMMlifZUNx7icbj =rGmf -----END PGP SIGNATURE----- Merge tag 'for-linus-v4.8' of git://github.com/martinbrandenburg/linux Pull orangefs update from Martin Brandenburg: "Kernel side caching and executable bugfix This allows OrangeFS to utilize the dcache and adds an in kernel attribute cache. We previously used the user side client for this purpose. We see a modest performance increase on small file operations. For example, without the cache, compiling coreutils takes about 17 minutes. With the patch and a 50 millisecond timeout for dcache_timeout_msecs and getattr_timeout_msecs (the default), compiling coreutils takes about 6 minutes 20 seconds. On the same hardware, compiling coreutils on an xfs filesystem takes 90 seconds. We see similar improvements with mdtest and a test involving writing, reading, and deleting a large number of small files. Interested parties can review more data at the following URL. https://docs.google.com/spreadsheets/d/1v4aUeppKexIbRMz_Yn9k4eaM3uy2KCaPoe_93YKWOtA/pubhtml The eventual goal of this is to allow getdents to turn into a readdirplus to the OrangeFS server. 
The cache will be filled then, which should provide a performance benefit to the common case of readdir followed by getattr on each entry (i.e. ls -l). This also fixes a bug. When orangefs_inode_permission was added, it did not collect i_size from the OrangeFS server, since this presses an unnecessary load on the OrangeFS server. However, it left a case where i_size is never initialized. Then running an executable could fail. With this patch, size is always collected to be inserted into the cache. Thus the bug disappears. If this patch is not accepted during this merge window, we will send a one-line band-aid for this bug instead" * tag 'for-linus-v4.8' of git://github.com/martinbrandenburg/linux: Orangefs: update orangefs.txt orangefs: Account for jiffies wraparound. orangefs: Change default dcache and getattr timeout to 50 msec. orangefs: Allow dcache and getattr cache time to be configured. orangefs: Cache getattr results. orangefs: Use d_time to avoid excessive lookups
This commit is contained in:
commit
8cbdd85bda
@ -403,4 +403,46 @@ Readdir responses initialize the fifth element io_array like this:
|
||||
from out_downcall member of global variable
|
||||
vfs_request
|
||||
|
||||
Orangefs exploits the dcache in order to avoid sending redundant
|
||||
requests to userspace. We keep object inode attributes up-to-date with
|
||||
orangefs_inode_getattr. That function uses two arguments to
|
||||
help it decide whether or not to update an inode: "new" and "bypass".
|
||||
Orangefs keeps private data in an object's inode that includes a short
|
||||
timeout value, getattr_time, which allows any invocation of
|
||||
orangefs_inode_getattr to know how long it has been since the inode was
|
||||
updated. When the object is not new (new == 0) and the bypass flag is not
|
||||
set (bypass == 0), orangefs_inode_getattr returns without updating the inode
|
||||
if getattr_time has not timed out. The getattr_time value is updated each time the
|
||||
inode is updated.
|
||||
|
||||
Creation of a new object (file, dir, sym-link) includes the evaluation of
|
||||
its pathname, resulting in a negative directory entry for the object.
|
||||
A new inode is allocated and associated with the dentry, turning it from
|
||||
a negative dentry into a "productive full member of society". Orangefs
|
||||
obtains the new inode from Linux with new_inode() and associates
|
||||
the inode with the dentry by sending the pair back to Linux with
|
||||
d_instantiate().
|
||||
|
||||
The evaluation of a pathname for an object resolves to its corresponding
|
||||
dentry. If there is no corresponding dentry, one is created for it in
|
||||
the dcache. Whenever a dentry is modified or verified, Orangefs stores a
|
||||
short timeout value in the dentry's d_time, and the dentry will be trusted
|
||||
for that amount of time. Orangefs is a network filesystem, and objects
|
||||
can potentially change out-of-band with any particular Orangefs kernel module
|
||||
instance, so trusting a dentry is risky. The alternative to trusting
|
||||
dentries is to always obtain the needed information from userspace - at
|
||||
least a trip to the client-core, maybe to the servers. Obtaining information
|
||||
from a dentry is cheap; obtaining it from userspace is relatively expensive,
|
||||
hence the motivation to use the dentry when possible.
|
||||
|
||||
The timeout values d_time and getattr_time are jiffy-based, and the
|
||||
code is designed to avoid the jiffy-wrap problem:
|
||||
|
||||
"In general, if the clock may have wrapped around more than once, there
|
||||
is no way to tell how much time has elapsed. However, if the times t1
|
||||
and t2 are known to be fairly close, we can reliably compute the
|
||||
difference in a way that takes into account the possibility that the
|
||||
clock may have wrapped between times."
|
||||
|
||||
from course notes by instructor Andy Wang
|
||||
|
||||
|
@ -73,6 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
|
||||
}
|
||||
}
|
||||
|
||||
dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
|
||||
ret = 1;
|
||||
out_release_op:
|
||||
op_release(new_op);
|
||||
@ -94,6 +95,9 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (time_before(jiffies, dentry->d_time))
|
||||
return 1;
|
||||
|
||||
if (flags & LOOKUP_RCU)
|
||||
return -ECHILD;
|
||||
|
||||
|
@ -262,7 +262,7 @@ int orangefs_getattr(struct vfsmount *mnt,
|
||||
"orangefs_getattr: called on %s\n",
|
||||
dentry->d_name.name);
|
||||
|
||||
ret = orangefs_inode_getattr(inode, 0, 1);
|
||||
ret = orangefs_inode_getattr(inode, 0, 0);
|
||||
if (ret == 0) {
|
||||
generic_fillattr(inode, kstat);
|
||||
|
||||
@ -384,7 +384,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
|
||||
if (!inode || !(inode->i_state & I_NEW))
|
||||
return inode;
|
||||
|
||||
error = orangefs_inode_getattr(inode, 1, 0);
|
||||
error = orangefs_inode_getattr(inode, 1, 1);
|
||||
if (error) {
|
||||
iget_failed(inode);
|
||||
return ERR_PTR(error);
|
||||
@ -429,7 +429,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
|
||||
orangefs_set_inode(inode, ref);
|
||||
inode->i_ino = hash; /* needed for stat etc */
|
||||
|
||||
error = orangefs_inode_getattr(inode, 1, 0);
|
||||
error = orangefs_inode_getattr(inode, 1, 1);
|
||||
if (error)
|
||||
goto out_iput;
|
||||
|
||||
|
@ -72,6 +72,8 @@ static int orangefs_create(struct inode *dir,
|
||||
|
||||
d_instantiate(dentry, inode);
|
||||
unlock_new_inode(inode);
|
||||
dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
|
||||
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
|
||||
|
||||
gossip_debug(GOSSIP_NAME_DEBUG,
|
||||
"%s: dentry instantiated for %s\n",
|
||||
@ -181,6 +183,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
|
||||
goto out;
|
||||
}
|
||||
|
||||
dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
|
||||
|
||||
inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
|
||||
if (IS_ERR(inode)) {
|
||||
gossip_debug(GOSSIP_NAME_DEBUG,
|
||||
@ -189,6 +193,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
|
||||
goto out;
|
||||
}
|
||||
|
||||
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
|
||||
|
||||
gossip_debug(GOSSIP_NAME_DEBUG,
|
||||
"%s:%s:%d "
|
||||
"Found good inode [%lu] with count [%d]\n",
|
||||
@ -316,6 +322,8 @@ static int orangefs_symlink(struct inode *dir,
|
||||
|
||||
d_instantiate(dentry, inode);
|
||||
unlock_new_inode(inode);
|
||||
dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
|
||||
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
|
||||
|
||||
gossip_debug(GOSSIP_NAME_DEBUG,
|
||||
"Inode (Symlink) %pU -> %s\n",
|
||||
@ -378,6 +386,8 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
|
||||
|
||||
d_instantiate(dentry, inode);
|
||||
unlock_new_inode(inode);
|
||||
dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
|
||||
ORANGEFS_I(inode)->getattr_time = jiffies - 1;
|
||||
|
||||
gossip_debug(GOSSIP_NAME_DEBUG,
|
||||
"Inode (Directory) %pU -> %s\n",
|
||||
@ -408,6 +418,8 @@ static int orangefs_rename(struct inode *old_dir,
|
||||
"orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
|
||||
old_dentry, new_dentry, d_count(new_dentry));
|
||||
|
||||
ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1;
|
||||
|
||||
new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
|
||||
if (!new_op)
|
||||
return -EINVAL;
|
||||
|
@ -246,6 +246,8 @@ struct orangefs_inode_s {
|
||||
* with this object
|
||||
*/
|
||||
unsigned long pinode_flags;
|
||||
|
||||
unsigned long getattr_time;
|
||||
};
|
||||
|
||||
#define P_ATIME_FLAG 0
|
||||
@ -527,7 +529,7 @@ int orangefs_inode_setxattr(struct inode *inode,
|
||||
size_t size,
|
||||
int flags);
|
||||
|
||||
int orangefs_inode_getattr(struct inode *inode, int new, int size);
|
||||
int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
|
||||
|
||||
int orangefs_inode_check_changed(struct inode *inode);
|
||||
|
||||
@ -546,6 +548,8 @@ extern struct mutex request_mutex;
|
||||
extern int debug;
|
||||
extern int op_timeout_secs;
|
||||
extern int slot_timeout_secs;
|
||||
extern int dcache_timeout_msecs;
|
||||
extern int getattr_timeout_msecs;
|
||||
extern struct list_head orangefs_superblocks;
|
||||
extern spinlock_t orangefs_superblocks_lock;
|
||||
extern struct list_head orangefs_request_list;
|
||||
|
@ -47,6 +47,8 @@ struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
|
||||
unsigned int kernel_mask_set_mod_init; /* implicitly false */
|
||||
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
|
||||
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
|
||||
int dcache_timeout_msecs = 50;
|
||||
int getattr_timeout_msecs = 50;
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("ORANGEFS Development Team");
|
||||
|
@ -61,10 +61,21 @@
|
||||
* Slots are requested and waited for,
|
||||
* the wait times out after slot_timeout_secs.
|
||||
*
|
||||
* What: /sys/fs/orangefs/dcache_timeout_msecs
|
||||
* Date: Jul 2016
|
||||
* Contact: Martin Brandenburg <martin@omnibond.com>
|
||||
* Description:
|
||||
* Time, in milliseconds, that a dentry lookup result is considered valid.
|
||||
*
|
||||
* What: /sys/fs/orangefs/getattr_timeout_msecs
|
||||
* Date: Jul 2016
|
||||
* Contact: Martin Brandenburg <martin@omnibond.com>
|
||||
* Description:
|
||||
* Time, in milliseconds, that cached getattr results are considered valid.
|
||||
*
|
||||
* What: /sys/fs/orangefs/acache/...
|
||||
* Date: Jun 2015
|
||||
* Contact: Mike Marshall <hubcap@omnibond.com>
|
||||
* Contact: Martin Brandenburg <martin@omnibond.com>
|
||||
* Description:
|
||||
* Attribute cache configurable settings.
|
||||
*
|
||||
@ -117,6 +128,8 @@ struct orangefs_obj {
|
||||
int perf_history_size;
|
||||
int perf_time_interval_secs;
|
||||
int slot_timeout_secs;
|
||||
int dcache_timeout_msecs;
|
||||
int getattr_timeout_msecs;
|
||||
};
|
||||
|
||||
struct acache_orangefs_obj {
|
||||
@ -658,6 +671,20 @@ static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
|
||||
"%d\n",
|
||||
slot_timeout_secs);
|
||||
goto out;
|
||||
} else if (!strcmp(orangefs_attr->attr.name,
|
||||
"dcache_timeout_msecs")) {
|
||||
rc = scnprintf(buf,
|
||||
PAGE_SIZE,
|
||||
"%d\n",
|
||||
dcache_timeout_msecs);
|
||||
goto out;
|
||||
} else if (!strcmp(orangefs_attr->attr.name,
|
||||
"getattr_timeout_msecs")) {
|
||||
rc = scnprintf(buf,
|
||||
PAGE_SIZE,
|
||||
"%d\n",
|
||||
getattr_timeout_msecs);
|
||||
goto out;
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
@ -734,6 +761,12 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj,
|
||||
} else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
|
||||
rc = kstrtoint(buf, 0, &slot_timeout_secs);
|
||||
goto out;
|
||||
} else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
|
||||
rc = kstrtoint(buf, 0, &dcache_timeout_msecs);
|
||||
goto out;
|
||||
} else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) {
|
||||
rc = kstrtoint(buf, 0, &getattr_timeout_msecs);
|
||||
goto out;
|
||||
} else {
|
||||
goto out;
|
||||
}
|
||||
@ -1361,6 +1394,12 @@ static struct orangefs_attribute op_timeout_secs_attribute =
|
||||
static struct orangefs_attribute slot_timeout_secs_attribute =
|
||||
__ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
|
||||
|
||||
static struct orangefs_attribute dcache_timeout_msecs_attribute =
|
||||
__ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store);
|
||||
|
||||
static struct orangefs_attribute getattr_timeout_msecs_attribute =
|
||||
__ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
|
||||
|
||||
static struct orangefs_attribute perf_counter_reset_attribute =
|
||||
__ATTR(perf_counter_reset,
|
||||
0664,
|
||||
@ -1382,6 +1421,8 @@ static struct orangefs_attribute perf_time_interval_secs_attribute =
|
||||
static struct attribute *orangefs_default_attrs[] = {
|
||||
&op_timeout_secs_attribute.attr,
|
||||
&slot_timeout_secs_attribute.attr,
|
||||
&dcache_timeout_msecs_attribute.attr,
|
||||
&getattr_timeout_msecs_attribute.attr,
|
||||
&perf_counter_reset_attribute.attr,
|
||||
&perf_history_size_attribute.attr,
|
||||
&perf_time_interval_secs_attribute.attr,
|
||||
|
@ -251,7 +251,7 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int orangefs_inode_getattr(struct inode *inode, int new, int size)
|
||||
int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
|
||||
{
|
||||
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
|
||||
struct orangefs_kernel_op_s *new_op;
|
||||
@ -261,12 +261,16 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
|
||||
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
|
||||
get_khandle_from_ino(inode));
|
||||
|
||||
if (!new && !bypass) {
|
||||
if (time_before(jiffies, orangefs_inode->getattr_time))
|
||||
return 0;
|
||||
}
|
||||
|
||||
new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
|
||||
if (!new_op)
|
||||
return -ENOMEM;
|
||||
new_op->upcall.req.getattr.refn = orangefs_inode->refn;
|
||||
new_op->upcall.req.getattr.mask = size ?
|
||||
ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
|
||||
new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
|
||||
|
||||
ret = service_operation(new_op, __func__,
|
||||
get_interruptible_flag(inode));
|
||||
@ -287,7 +291,6 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
|
||||
case S_IFREG:
|
||||
inode->i_flags = orangefs_inode_flags(&new_op->
|
||||
downcall.resp.getattr.attributes);
|
||||
if (size) {
|
||||
inode_size = (loff_t)new_op->
|
||||
downcall.resp.getattr.attributes.size;
|
||||
rounded_up_size =
|
||||
@ -300,7 +303,6 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
|
||||
inode->i_blocks =
|
||||
(unsigned long)(rounded_up_size / 512);
|
||||
spin_unlock(&inode->i_lock);
|
||||
}
|
||||
break;
|
||||
case S_IFDIR:
|
||||
inode->i_size = PAGE_SIZE;
|
||||
@ -345,6 +347,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
|
||||
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
|
||||
orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
|
||||
|
||||
orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000;
|
||||
ret = 0;
|
||||
out:
|
||||
op_release(new_op);
|
||||
@ -418,6 +421,7 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
|
||||
ClearMtimeFlag(orangefs_inode);
|
||||
ClearCtimeFlag(orangefs_inode);
|
||||
ClearModeFlag(orangefs_inode);
|
||||
orangefs_inode->getattr_time = jiffies - 1;
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@ -207,14 +207,6 @@ typedef __s64 ORANGEFS_offset;
|
||||
ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
|
||||
ORANGEFS_ATTR_SYS_BLKSIZE)
|
||||
|
||||
#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \
|
||||
(ORANGEFS_ATTR_SYS_COMMON_ALL | \
|
||||
ORANGEFS_ATTR_SYS_LNK_TARGET | \
|
||||
ORANGEFS_ATTR_SYS_DFILE_COUNT | \
|
||||
ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
|
||||
ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
|
||||
ORANGEFS_ATTR_SYS_BLKSIZE)
|
||||
|
||||
#define ORANGEFS_XATTR_REPLACE 0x2
|
||||
#define ORANGEFS_XATTR_CREATE 0x1
|
||||
#define ORANGEFS_MAX_SERVER_ADDR_LEN 256
|
||||
|
Loading…
x
Reference in New Issue
Block a user