mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-07 13:43:51 +00:00
eca2040972
The I/O priority user interface defines the 16-bits ioprio values as the combination of the upper 3-bits for an I/O priority class and the lower 13-bits as priority data. However, the kernel only uses the lower 3-bits of the priority data to define priority levels for the RT and BE priority classes. The data part of an ioprio value is completely ignored for the IDLE and NONE classes. This is enforced by checks done in ioprio_check_cap(), which is called for all paths that allow defining an I/O priority for I/Os: the per-context ioprio_set() system call, aio interface and io_uring interface. Clarify this fact in the uapi ioprio.h header file and introduce the IOPRIO_PRIO_LEVEL_MASK and IOPRIO_PRIO_LEVEL() macros for users to define and get priority levels in an ioprio value. The coarser macro IOPRIO_PRIO_DATA() is retained for backward compatibility with old applications already using it. There is no functional change introduced with this. In-kernel users of the IOPRIO_PRIO_DATA() macro which are explicitly handling I/O priority data as a priority level are modified to use the new IOPRIO_PRIO_LEVEL() macro without any functional change. Since f2fs is the only user of this macro not explicitly using that value as a priority level, it is left unchanged. Signed-off-by: Damien Le Moal <dlemoal@kernel.org> Reviewed-by: Hannes Reinecke <hare@suse.de> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com> Link: https://lore.kernel.org/r/20230511011356.227789-2-nks@flawful.org Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
279 lines
5.8 KiB
C
279 lines
5.8 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* fs/ioprio.c
|
|
*
|
|
* Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
|
|
*
|
|
* Helper functions for setting/querying io priorities of processes. The
|
|
* system calls closely mimmick getpriority/setpriority, see the man page for
|
|
* those. The prio argument is a composite of prio class and prio data, where
|
|
* the data argument has meaning within that class. The standard scheduling
|
|
* classes have 8 distinct prio levels, with 0 being the highest prio and 7
|
|
* being the lowest.
|
|
*
|
|
* IOW, setting BE scheduling class with prio 2 is done ala:
|
|
*
|
|
* unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
|
|
*
|
|
* ioprio_set(PRIO_PROCESS, pid, prio);
|
|
*
|
|
* See also Documentation/block/ioprio.rst
|
|
*
|
|
*/
|
|
#include <linux/gfp.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/ioprio.h>
|
|
#include <linux/cred.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/capability.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/security.h>
|
|
#include <linux/pid_namespace.h>
|
|
|
|
int ioprio_check_cap(int ioprio)
|
|
{
|
|
int class = IOPRIO_PRIO_CLASS(ioprio);
|
|
int level = IOPRIO_PRIO_LEVEL(ioprio);
|
|
|
|
switch (class) {
|
|
case IOPRIO_CLASS_RT:
|
|
/*
|
|
* Originally this only checked for CAP_SYS_ADMIN,
|
|
* which was implicitly allowed for pid 0 by security
|
|
* modules such as SELinux. Make sure we check
|
|
* CAP_SYS_ADMIN first to avoid a denial/avc for
|
|
* possibly missing CAP_SYS_NICE permission.
|
|
*/
|
|
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
|
|
return -EPERM;
|
|
fallthrough;
|
|
/* rt has prio field too */
|
|
case IOPRIO_CLASS_BE:
|
|
if (level >= IOPRIO_NR_LEVELS)
|
|
return -EINVAL;
|
|
break;
|
|
case IOPRIO_CLASS_IDLE:
|
|
break;
|
|
case IOPRIO_CLASS_NONE:
|
|
if (level)
|
|
return -EINVAL;
|
|
break;
|
|
default:
|
|
return -EINVAL;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
|
|
{
|
|
struct task_struct *p, *g;
|
|
struct user_struct *user;
|
|
struct pid *pgrp;
|
|
kuid_t uid;
|
|
int ret;
|
|
|
|
ret = ioprio_check_cap(ioprio);
|
|
if (ret)
|
|
return ret;
|
|
|
|
ret = -ESRCH;
|
|
rcu_read_lock();
|
|
switch (which) {
|
|
case IOPRIO_WHO_PROCESS:
|
|
if (!who)
|
|
p = current;
|
|
else
|
|
p = find_task_by_vpid(who);
|
|
if (p)
|
|
ret = set_task_ioprio(p, ioprio);
|
|
break;
|
|
case IOPRIO_WHO_PGRP:
|
|
if (!who)
|
|
pgrp = task_pgrp(current);
|
|
else
|
|
pgrp = find_vpid(who);
|
|
|
|
read_lock(&tasklist_lock);
|
|
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
|
|
ret = set_task_ioprio(p, ioprio);
|
|
if (ret) {
|
|
read_unlock(&tasklist_lock);
|
|
goto out;
|
|
}
|
|
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
|
|
read_unlock(&tasklist_lock);
|
|
|
|
break;
|
|
case IOPRIO_WHO_USER:
|
|
uid = make_kuid(current_user_ns(), who);
|
|
if (!uid_valid(uid))
|
|
break;
|
|
if (!who)
|
|
user = current_user();
|
|
else
|
|
user = find_user(uid);
|
|
|
|
if (!user)
|
|
break;
|
|
|
|
for_each_process_thread(g, p) {
|
|
if (!uid_eq(task_uid(p), uid) ||
|
|
!task_pid_vnr(p))
|
|
continue;
|
|
ret = set_task_ioprio(p, ioprio);
|
|
if (ret)
|
|
goto free_uid;
|
|
}
|
|
free_uid:
|
|
if (who)
|
|
free_uid(user);
|
|
break;
|
|
default:
|
|
ret = -EINVAL;
|
|
}
|
|
|
|
out:
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* If the task has set an I/O priority, use that. Otherwise, return
|
|
* the default I/O priority.
|
|
*
|
|
* Expected to be called for current task or with task_lock() held to keep
|
|
* io_context stable.
|
|
*/
|
|
int __get_task_ioprio(struct task_struct *p)
|
|
{
|
|
struct io_context *ioc = p->io_context;
|
|
int prio;
|
|
|
|
if (p != current)
|
|
lockdep_assert_held(&p->alloc_lock);
|
|
if (ioc)
|
|
prio = ioc->ioprio;
|
|
else
|
|
prio = IOPRIO_DEFAULT;
|
|
|
|
if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
|
|
prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
|
|
task_nice_ioprio(p));
|
|
return prio;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__get_task_ioprio);
|
|
|
|
static int get_task_ioprio(struct task_struct *p)
|
|
{
|
|
int ret;
|
|
|
|
ret = security_task_getioprio(p);
|
|
if (ret)
|
|
goto out;
|
|
task_lock(p);
|
|
ret = __get_task_ioprio(p);
|
|
task_unlock(p);
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Return raw IO priority value as set by userspace. We use this for
|
|
* ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and
|
|
* also so that userspace can distinguish unset IO priority (which just gets
|
|
* overriden based on task's nice value) from IO priority set to some value.
|
|
*/
|
|
static int get_task_raw_ioprio(struct task_struct *p)
|
|
{
|
|
int ret;
|
|
|
|
ret = security_task_getioprio(p);
|
|
if (ret)
|
|
goto out;
|
|
task_lock(p);
|
|
if (p->io_context)
|
|
ret = p->io_context->ioprio;
|
|
else
|
|
ret = IOPRIO_DEFAULT;
|
|
task_unlock(p);
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static int ioprio_best(unsigned short aprio, unsigned short bprio)
|
|
{
|
|
return min(aprio, bprio);
|
|
}
|
|
|
|
SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
|
|
{
|
|
struct task_struct *g, *p;
|
|
struct user_struct *user;
|
|
struct pid *pgrp;
|
|
kuid_t uid;
|
|
int ret = -ESRCH;
|
|
int tmpio;
|
|
|
|
rcu_read_lock();
|
|
switch (which) {
|
|
case IOPRIO_WHO_PROCESS:
|
|
if (!who)
|
|
p = current;
|
|
else
|
|
p = find_task_by_vpid(who);
|
|
if (p)
|
|
ret = get_task_raw_ioprio(p);
|
|
break;
|
|
case IOPRIO_WHO_PGRP:
|
|
if (!who)
|
|
pgrp = task_pgrp(current);
|
|
else
|
|
pgrp = find_vpid(who);
|
|
read_lock(&tasklist_lock);
|
|
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
|
|
tmpio = get_task_ioprio(p);
|
|
if (tmpio < 0)
|
|
continue;
|
|
if (ret == -ESRCH)
|
|
ret = tmpio;
|
|
else
|
|
ret = ioprio_best(ret, tmpio);
|
|
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
|
|
read_unlock(&tasklist_lock);
|
|
|
|
break;
|
|
case IOPRIO_WHO_USER:
|
|
uid = make_kuid(current_user_ns(), who);
|
|
if (!who)
|
|
user = current_user();
|
|
else
|
|
user = find_user(uid);
|
|
|
|
if (!user)
|
|
break;
|
|
|
|
for_each_process_thread(g, p) {
|
|
if (!uid_eq(task_uid(p), user->uid) ||
|
|
!task_pid_vnr(p))
|
|
continue;
|
|
tmpio = get_task_ioprio(p);
|
|
if (tmpio < 0)
|
|
continue;
|
|
if (ret == -ESRCH)
|
|
ret = tmpio;
|
|
else
|
|
ret = ioprio_best(ret, tmpio);
|
|
}
|
|
|
|
if (who)
|
|
free_uid(user);
|
|
break;
|
|
default:
|
|
ret = -EINVAL;
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
return ret;
|
|
}
|