linux-stable/block/ioprio.c
Damien Le Moal eca2040972 scsi: block: ioprio: Clean up interface definition
The I/O priority user interface defines the 16-bits ioprio values as the
combination of the upper 3-bits for an I/O priority class and the lower
13-bits as priority data. However, the kernel only uses the lower 3-bits of
the priority data to define priority levels for the RT and BE priority
classes. The data part of an ioprio value is completely ignored for the
IDLE and NONE classes. This is enforced by checks done in
ioprio_check_cap(), which is called for all paths that allow defining an
I/O priority for I/Os: the per-context ioprio_set() system call, aio
interface and io_uring interface.

Clarify this fact in the uapi ioprio.h header file and introduce the
IOPRIO_PRIO_LEVEL_MASK and IOPRIO_PRIO_LEVEL() macros for users to define
and get priority levels in an ioprio value. The coarser macro
IOPRIO_PRIO_DATA() is retained for backward compatibility with old
applications already using it. There is no functional change introduced
with this.

In-kernel users of the IOPRIO_PRIO_DATA() macro which are explicitly
handling I/O priority data as a priority level are modified to use the new
IOPRIO_PRIO_LEVEL() macro without any functional change. Since f2fs is the
only user of this macro not explicitly using that value as a priority
level, it is left unchanged.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Link: https://lore.kernel.org/r/20230511011356.227789-2-nks@flawful.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
2023-05-22 17:05:18 -04:00

279 lines
5.8 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* fs/ioprio.c
*
* Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
*
* Helper functions for setting/querying io priorities of processes. The
* system calls closely mimmick getpriority/setpriority, see the man page for
* those. The prio argument is a composite of prio class and prio data, where
* the data argument has meaning within that class. The standard scheduling
* classes have 8 distinct prio levels, with 0 being the highest prio and 7
* being the lowest.
*
* IOW, setting BE scheduling class with prio 2 is done ala:
*
* unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
*
* ioprio_set(PRIO_PROCESS, pid, prio);
*
* See also Documentation/block/ioprio.rst
*
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/ioprio.h>
#include <linux/cred.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
int ioprio_check_cap(int ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
int level = IOPRIO_PRIO_LEVEL(ioprio);
switch (class) {
case IOPRIO_CLASS_RT:
/*
* Originally this only checked for CAP_SYS_ADMIN,
* which was implicitly allowed for pid 0 by security
* modules such as SELinux. Make sure we check
* CAP_SYS_ADMIN first to avoid a denial/avc for
* possibly missing CAP_SYS_NICE permission.
*/
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
fallthrough;
/* rt has prio field too */
case IOPRIO_CLASS_BE:
if (level >= IOPRIO_NR_LEVELS)
return -EINVAL;
break;
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:
if (level)
return -EINVAL;
break;
default:
return -EINVAL;
}
return 0;
}
SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
{
struct task_struct *p, *g;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret;
ret = ioprio_check_cap(ioprio);
if (ret)
return ret;
ret = -ESRCH;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = set_task_ioprio(p, ioprio);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
ret = set_task_ioprio(p, ioprio);
if (ret) {
read_unlock(&tasklist_lock);
goto out;
}
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
read_unlock(&tasklist_lock);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!uid_valid(uid))
break;
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
for_each_process_thread(g, p) {
if (!uid_eq(task_uid(p), uid) ||
!task_pid_vnr(p))
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
goto free_uid;
}
free_uid:
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
out:
rcu_read_unlock();
return ret;
}
/*
* If the task has set an I/O priority, use that. Otherwise, return
* the default I/O priority.
*
* Expected to be called for current task or with task_lock() held to keep
* io_context stable.
*/
int __get_task_ioprio(struct task_struct *p)
{
struct io_context *ioc = p->io_context;
int prio;
if (p != current)
lockdep_assert_held(&p->alloc_lock);
if (ioc)
prio = ioc->ioprio;
else
prio = IOPRIO_DEFAULT;
if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
task_nice_ioprio(p));
return prio;
}
EXPORT_SYMBOL_GPL(__get_task_ioprio);
static int get_task_ioprio(struct task_struct *p)
{
int ret;
ret = security_task_getioprio(p);
if (ret)
goto out;
task_lock(p);
ret = __get_task_ioprio(p);
task_unlock(p);
out:
return ret;
}
/*
* Return raw IO priority value as set by userspace. We use this for
* ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and
* also so that userspace can distinguish unset IO priority (which just gets
* overriden based on task's nice value) from IO priority set to some value.
*/
static int get_task_raw_ioprio(struct task_struct *p)
{
int ret;
ret = security_task_getioprio(p);
if (ret)
goto out;
task_lock(p);
if (p->io_context)
ret = p->io_context->ioprio;
else
ret = IOPRIO_DEFAULT;
task_unlock(p);
out:
return ret;
}
static int ioprio_best(unsigned short aprio, unsigned short bprio)
{
return min(aprio, bprio);
}
SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
{
struct task_struct *g, *p;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret = -ESRCH;
int tmpio;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = get_task_raw_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
read_unlock(&tasklist_lock);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
for_each_process_thread(g, p) {
if (!uid_eq(task_uid(p), user->uid) ||
!task_pid_vnr(p))
continue;
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
}
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
rcu_read_unlock();
return ret;
}