2019-06-01 10:08:50 +02:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2008-03-07 21:55:58 -05:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2008 Intel Corporation
|
|
|
|
* Author: Matthew Wilcox <willy@linux.intel.com>
|
|
|
|
*
|
2008-04-11 15:23:52 -04:00
|
|
|
* This file implements counting semaphores.
|
|
|
|
* A counting semaphore may be acquired 'n' times before sleeping.
|
|
|
|
* See mutex.c for single-acquisition sleeping locks which enforce
|
|
|
|
* rules which allow code to be debugged more easily.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Some notes on the implementation:
|
|
|
|
*
|
|
|
|
* The spinlock controls access to the other members of the semaphore.
|
|
|
|
* down_trylock() and up() can be called from interrupt context, so we
|
|
|
|
* have to disable interrupts when taking the lock. It turns out various
|
|
|
|
* parts of the kernel expect to be able to use down() on a semaphore in
|
|
|
|
* interrupt context when they know it will succeed, so we have to use
|
|
|
|
* irqsave variants for down(), down_interruptible() and down_killable()
|
|
|
|
* too.
|
|
|
|
*
|
|
|
|
* The ->count variable represents how many more tasks can acquire this
|
|
|
|
* semaphore. If it's zero, there may be tasks waiting on the wait_list.
|
2008-03-07 21:55:58 -05:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/kernel.h>
|
2011-05-23 14:51:41 -04:00
|
|
|
#include <linux/export.h>
|
2008-03-07 21:55:58 -05:00
|
|
|
#include <linux/sched.h>
|
2017-02-08 18:51:35 +01:00
|
|
|
#include <linux/sched/debug.h>
|
2008-03-07 21:55:58 -05:00
|
|
|
#include <linux/semaphore.h>
|
|
|
|
#include <linux/spinlock.h>
|
2008-05-12 21:21:15 +02:00
|
|
|
#include <linux/ftrace.h>
|
2022-03-22 11:57:09 -07:00
|
|
|
#include <trace/events/lock.h>
|
2008-03-07 21:55:58 -05:00
|
|
|
|
|
|
|
/*
 * Forward declarations of the out-of-line helpers used by the public entry
 * points below.  The __down*() variants are called when ->count is not
 * positive, and __up() is called when ->wait_list is not empty; in every
 * case the caller already holds sem->lock with interrupts disabled.
 */
static noinline void __down(struct semaphore *sem);
|
|
|
|
static noinline int __down_interruptible(struct semaphore *sem);
|
2008-03-14 13:19:33 -04:00
|
|
|
static noinline int __down_killable(struct semaphore *sem);
|
2014-09-03 03:17:24 -07:00
|
|
|
static noinline int __down_timeout(struct semaphore *sem, long timeout);
|
2008-03-07 21:55:58 -05:00
|
|
|
static noinline void __up(struct semaphore *sem);
|
|
|
|
|
2008-04-11 15:23:52 -04:00
|
|
|
/**
|
|
|
|
* down - acquire the semaphore
|
|
|
|
* @sem: the semaphore to be acquired
|
|
|
|
*
|
|
|
|
* Acquires the semaphore. If no more tasks are allowed to acquire the
|
|
|
|
* semaphore, calling this function will put the task to sleep until the
|
|
|
|
* semaphore is released.
|
|
|
|
*
|
|
|
|
* Use of this function is deprecated, please use down_interruptible() or
|
|
|
|
* down_killable() instead.
|
|
|
|
*/
|
2022-09-08 17:08:03 -07:00
|
|
|
void __sched down(struct semaphore *sem)
|
2008-03-07 21:55:58 -05:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
2021-08-09 10:12:15 +08:00
|
|
|
might_sleep();
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
2008-05-10 20:43:22 -07:00
|
|
|
if (likely(sem->count > 0))
|
|
|
|
sem->count--;
|
|
|
|
else
|
2008-03-07 21:55:58 -05:00
|
|
|
__down(sem);
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
2008-03-07 21:55:58 -05:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(down);
|
|
|
|
|
2008-04-11 15:23:52 -04:00
|
|
|
/**
|
|
|
|
* down_interruptible - acquire the semaphore unless interrupted
|
|
|
|
* @sem: the semaphore to be acquired
|
|
|
|
*
|
|
|
|
* Attempts to acquire the semaphore. If no more tasks are allowed to
|
|
|
|
* acquire the semaphore, calling this function will put the task to sleep.
|
|
|
|
* If the sleep is interrupted by a signal, this function will return -EINTR.
|
|
|
|
* If the semaphore is successfully acquired, this function returns 0.
|
|
|
|
*/
|
2022-09-08 17:08:03 -07:00
|
|
|
int __sched down_interruptible(struct semaphore *sem)
|
2008-03-07 21:55:58 -05:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
int result = 0;
|
|
|
|
|
2021-08-09 10:12:15 +08:00
|
|
|
might_sleep();
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
2008-05-10 20:43:22 -07:00
|
|
|
if (likely(sem->count > 0))
|
semaphore: fix
Yanmin Zhang reported:
| Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th
| regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
| and Itanium Montecito. Bisect located the patch below:
|
| 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
| commit 64ac24e738823161693bf791f87adc802cf529ff
| Author: Matthew Wilcox <matthew@wil.cx>
| Date: Fri Mar 7 21:55:58 2008 -0500
|
| Generic semaphore implementation
|
| After I manually reverted the patch against 2.6.26-rc1 while fixing
| lots of conflicts/errors, aim7 regression became less than 2%.
i reproduced the AIM7 workload and can confirm Yanmin's findings that
-.26-rc1 regresses over .25 - by over 67% here.
Looking at the workload i found and fixed what i believe to be the real
bug causing the AIM7 regression: it was inefficient wakeup / scheduling
/ locking behavior of the new generic semaphore code, causing suboptimal
performance.
The problem comes from the following code. The new semaphore code does
this on down():
spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
spin_unlock_irqrestore(&sem->lock, flags);
and this on up():
spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
where __up() does:
list_del(&waiter->list);
waiter->up = 1;
wake_up_process(waiter->task);
and where __down() does this in essence:
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.up = 0;
for (;;) {
[...]
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
the fastpath looks good and obvious, but note the following property of
the contended path: if there's a task on the ->wait_list, the up() of
the current owner will "pass over" ownership to that waiting task, in a
wake-one manner, via the waiter->up flag and by removing the waiter from
the wait list.
That is all and fine in principle, but as implemented in
kernel/semaphore.c it also creates a nasty, hidden source of contention!
The contention comes from the following property of the new semaphore
code: the new owner owns the semaphore exclusively, even if it is not
running yet.
So if the old owner, even if just a few instructions later, does a
down() [lock_kernel()] again, it will be blocked and will have to wait
on the new owner to eventually be scheduled (possibly on another CPU)!
Or if another task gets to lock_kernel() sooner than the "new owner"
scheduled, it will be blocked unnecessarily and for a very long time
when there are 2000 tasks running.
I.e. the implementation of the new semaphores code does wake-one and
lock ownership in a very restrictive way - it does not allow
opportunistic re-locking of the lock at all and keeps the scheduler from
picking task order intelligently.
This kind of scheduling, with 2000 AIM7 processes running, creates awful
cross-scheduling between those 2000 tasks, causes reduced parallelism, a
throttled runqueue length and a lot of idle time. With increasing number
of CPUs it causes an exponentially worse behavior in AIM7, as the chance
for a newly woken new-owner task to actually run anytime soon is less
and less likely.
Note that it takes just a tiny bit of contention for the 'new-semaphore
catastrophy' to happen: the wakeup latencies get added to whatever small
contention there is, and quickly snowball out of control!
I believe Yanmin's findings and numbers support this analysis too.
The best fix for this problem is to use the same scheduling logic that
the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
wanted because we do not want to over-schedule), but also allow
opportunistic locking of the lock even if a wakee is already "in
flight".
The patch below implements this new logic. With this patch applied the
AIM7 regression is largely fixed on my quad testbox:
# v2.6.25 vanilla:
..................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 56096.4 91 207.5 789.7 0.4675
2000 55894.4 94 208.2 792.7 0.4658
# v2.6.26-rc1-166-gc0a1811 vanilla:
...................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 33230.6 83 350.3 784.5 0.2769
2000 31778.1 86 366.3 783.6 0.2648
# v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
...............................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 55707.1 92 209.0 795.6 0.4642
2000 55704.4 96 209.0 796.0 0.4642
i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
performance levels and have zero idle time during the test, as expected.
Btw., interactivity also improved dramatically with the fix - for
example console-switching became almost instantaneous during this
workload (which after all is running 2000 tasks at once!), without the
patch it was stuck for a minute at times.
There's another nice side-effect of this speedup patch, the new generic
semaphore code got even smaller:
text data bss dec hex filename
1241 0 0 1241 4d9 semaphore.o.before
1207 0 0 1207 4b7 semaphore.o.after
(because the waiter.up complication got removed.)
Longer-term we should look into using the mutex code for the generic
semaphore code as well - but i's not easy due to legacies and it's
outside of the scope of v2.6.26 and outside the scope of this patch as
well.
Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 11:53:48 +02:00
|
|
|
sem->count--;
|
2008-05-10 20:43:22 -07:00
|
|
|
else
|
|
|
|
result = __down_interruptible(sem);
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
2008-03-07 21:55:58 -05:00
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(down_interruptible);
|
|
|
|
|
2008-04-11 15:23:52 -04:00
|
|
|
/**
|
|
|
|
* down_killable - acquire the semaphore unless killed
|
|
|
|
* @sem: the semaphore to be acquired
|
|
|
|
*
|
|
|
|
* Attempts to acquire the semaphore. If no more tasks are allowed to
|
|
|
|
* acquire the semaphore, calling this function will put the task to sleep.
|
|
|
|
* If the sleep is interrupted by a fatal signal, this function will return
|
|
|
|
* -EINTR. If the semaphore is successfully acquired, this function returns
|
|
|
|
* 0.
|
|
|
|
*/
|
2022-09-08 17:08:03 -07:00
|
|
|
int __sched down_killable(struct semaphore *sem)
|
2008-03-14 13:19:33 -04:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
int result = 0;
|
|
|
|
|
2021-08-09 10:12:15 +08:00
|
|
|
might_sleep();
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
2008-05-10 20:43:22 -07:00
|
|
|
if (likely(sem->count > 0))
|
semaphore: fix
Yanmin Zhang reported:
| Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th
| regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
| and Itanium Montecito. Bisect located the patch below:
|
| 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
| commit 64ac24e738823161693bf791f87adc802cf529ff
| Author: Matthew Wilcox <matthew@wil.cx>
| Date: Fri Mar 7 21:55:58 2008 -0500
|
| Generic semaphore implementation
|
| After I manually reverted the patch against 2.6.26-rc1 while fixing
| lots of conflicts/errors, aim7 regression became less than 2%.
i reproduced the AIM7 workload and can confirm Yanmin's findings that
-.26-rc1 regresses over .25 - by over 67% here.
Looking at the workload i found and fixed what i believe to be the real
bug causing the AIM7 regression: it was inefficient wakeup / scheduling
/ locking behavior of the new generic semaphore code, causing suboptimal
performance.
The problem comes from the following code. The new semaphore code does
this on down():
spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
spin_unlock_irqrestore(&sem->lock, flags);
and this on up():
spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
where __up() does:
list_del(&waiter->list);
waiter->up = 1;
wake_up_process(waiter->task);
and where __down() does this in essence:
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.up = 0;
for (;;) {
[...]
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
the fastpath looks good and obvious, but note the following property of
the contended path: if there's a task on the ->wait_list, the up() of
the current owner will "pass over" ownership to that waiting task, in a
wake-one manner, via the waiter->up flag and by removing the waiter from
the wait list.
That is all and fine in principle, but as implemented in
kernel/semaphore.c it also creates a nasty, hidden source of contention!
The contention comes from the following property of the new semaphore
code: the new owner owns the semaphore exclusively, even if it is not
running yet.
So if the old owner, even if just a few instructions later, does a
down() [lock_kernel()] again, it will be blocked and will have to wait
on the new owner to eventually be scheduled (possibly on another CPU)!
Or if another task gets to lock_kernel() sooner than the "new owner"
scheduled, it will be blocked unnecessarily and for a very long time
when there are 2000 tasks running.
I.e. the implementation of the new semaphores code does wake-one and
lock ownership in a very restrictive way - it does not allow
opportunistic re-locking of the lock at all and keeps the scheduler from
picking task order intelligently.
This kind of scheduling, with 2000 AIM7 processes running, creates awful
cross-scheduling between those 2000 tasks, causes reduced parallelism, a
throttled runqueue length and a lot of idle time. With increasing number
of CPUs it causes an exponentially worse behavior in AIM7, as the chance
for a newly woken new-owner task to actually run anytime soon is less
and less likely.
Note that it takes just a tiny bit of contention for the 'new-semaphore
catastrophy' to happen: the wakeup latencies get added to whatever small
contention there is, and quickly snowball out of control!
I believe Yanmin's findings and numbers support this analysis too.
The best fix for this problem is to use the same scheduling logic that
the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
wanted because we do not want to over-schedule), but also allow
opportunistic locking of the lock even if a wakee is already "in
flight".
The patch below implements this new logic. With this patch applied the
AIM7 regression is largely fixed on my quad testbox:
# v2.6.25 vanilla:
..................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 56096.4 91 207.5 789.7 0.4675
2000 55894.4 94 208.2 792.7 0.4658
# v2.6.26-rc1-166-gc0a1811 vanilla:
...................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 33230.6 83 350.3 784.5 0.2769
2000 31778.1 86 366.3 783.6 0.2648
# v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
...............................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 55707.1 92 209.0 795.6 0.4642
2000 55704.4 96 209.0 796.0 0.4642
i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
performance levels and have zero idle time during the test, as expected.
Btw., interactivity also improved dramatically with the fix - for
example console-switching became almost instantaneous during this
workload (which after all is running 2000 tasks at once!), without the
patch it was stuck for a minute at times.
There's another nice side-effect of this speedup patch, the new generic
semaphore code got even smaller:
text data bss dec hex filename
1241 0 0 1241 4d9 semaphore.o.before
1207 0 0 1207 4b7 semaphore.o.after
(because the waiter.up complication got removed.)
Longer-term we should look into using the mutex code for the generic
semaphore code as well - but i's not easy due to legacies and it's
outside of the scope of v2.6.26 and outside the scope of this patch as
well.
Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 11:53:48 +02:00
|
|
|
sem->count--;
|
2008-05-10 20:43:22 -07:00
|
|
|
else
|
|
|
|
result = __down_killable(sem);
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
2008-03-14 13:19:33 -04:00
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(down_killable);
|
|
|
|
|
2008-03-07 21:55:58 -05:00
|
|
|
/**
|
|
|
|
* down_trylock - try to acquire the semaphore, without waiting
|
|
|
|
* @sem: the semaphore to be acquired
|
|
|
|
*
|
2012-03-03 16:18:47 +02:00
|
|
|
* Try to acquire the semaphore atomically. Returns 0 if the semaphore has
|
2021-02-25 17:21:10 -08:00
|
|
|
* been acquired successfully or 1 if it cannot be acquired.
|
2008-03-07 21:55:58 -05:00
|
|
|
*
|
|
|
|
* NOTE: This return value is inverted from both spin_trylock and
|
|
|
|
* mutex_trylock! Be careful about this when converting code.
|
|
|
|
*
|
|
|
|
* Unlike mutex_trylock, this function can be used from interrupt context,
|
|
|
|
* and the semaphore can be released by any task or interrupt.
|
|
|
|
*/
|
2022-09-08 17:08:03 -07:00
|
|
|
int __sched down_trylock(struct semaphore *sem)
|
2008-03-07 21:55:58 -05:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
int count;
|
|
|
|
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
2008-03-07 21:55:58 -05:00
|
|
|
count = sem->count - 1;
|
|
|
|
if (likely(count >= 0))
|
|
|
|
sem->count = count;
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
2008-03-07 21:55:58 -05:00
|
|
|
|
|
|
|
return (count < 0);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(down_trylock);
|
|
|
|
|
2008-04-11 15:23:52 -04:00
|
|
|
/**
|
|
|
|
* down_timeout - acquire the semaphore within a specified time
|
|
|
|
* @sem: the semaphore to be acquired
|
2014-09-03 03:17:24 -07:00
|
|
|
* @timeout: how long to wait before failing
|
2008-04-11 15:23:52 -04:00
|
|
|
*
|
|
|
|
* Attempts to acquire the semaphore. If no more tasks are allowed to
|
|
|
|
* acquire the semaphore, calling this function will put the task to sleep.
|
|
|
|
* If the semaphore is not released within the specified number of jiffies,
|
|
|
|
* this function returns -ETIME. It returns 0 if the semaphore was acquired.
|
|
|
|
*/
|
2022-09-08 17:08:03 -07:00
|
|
|
int __sched down_timeout(struct semaphore *sem, long timeout)
|
2008-03-14 13:43:13 -04:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
int result = 0;
|
|
|
|
|
2021-08-09 10:12:15 +08:00
|
|
|
might_sleep();
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
2008-05-10 20:43:22 -07:00
|
|
|
if (likely(sem->count > 0))
|
semaphore: fix
Yanmin Zhang reported:
| Comparing with kernel 2.6.25, AIM7 (use tmpfs) has more th
| regression under 2.6.26-rc1 on my 8-core stoakley, 16-core tigerton,
| and Itanium Montecito. Bisect located the patch below:
|
| 64ac24e738823161693bf791f87adc802cf529ff is first bad commit
| commit 64ac24e738823161693bf791f87adc802cf529ff
| Author: Matthew Wilcox <matthew@wil.cx>
| Date: Fri Mar 7 21:55:58 2008 -0500
|
| Generic semaphore implementation
|
| After I manually reverted the patch against 2.6.26-rc1 while fixing
| lots of conflicts/errors, aim7 regression became less than 2%.
i reproduced the AIM7 workload and can confirm Yanmin's findings that
-.26-rc1 regresses over .25 - by over 67% here.
Looking at the workload i found and fixed what i believe to be the real
bug causing the AIM7 regression: it was inefficient wakeup / scheduling
/ locking behavior of the new generic semaphore code, causing suboptimal
performance.
The problem comes from the following code. The new semaphore code does
this on down():
spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
spin_unlock_irqrestore(&sem->lock, flags);
and this on up():
spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
spin_unlock_irqrestore(&sem->lock, flags);
where __up() does:
list_del(&waiter->list);
waiter->up = 1;
wake_up_process(waiter->task);
and where __down() does this in essence:
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
waiter.up = 0;
for (;;) {
[...]
spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
the fastpath looks good and obvious, but note the following property of
the contended path: if there's a task on the ->wait_list, the up() of
the current owner will "pass over" ownership to that waiting task, in a
wake-one manner, via the waiter->up flag and by removing the waiter from
the wait list.
That is all and fine in principle, but as implemented in
kernel/semaphore.c it also creates a nasty, hidden source of contention!
The contention comes from the following property of the new semaphore
code: the new owner owns the semaphore exclusively, even if it is not
running yet.
So if the old owner, even if just a few instructions later, does a
down() [lock_kernel()] again, it will be blocked and will have to wait
on the new owner to eventually be scheduled (possibly on another CPU)!
Or if another task gets to lock_kernel() sooner than the "new owner"
scheduled, it will be blocked unnecessarily and for a very long time
when there are 2000 tasks running.
I.e. the implementation of the new semaphores code does wake-one and
lock ownership in a very restrictive way - it does not allow
opportunistic re-locking of the lock at all and keeps the scheduler from
picking task order intelligently.
This kind of scheduling, with 2000 AIM7 processes running, creates awful
cross-scheduling between those 2000 tasks, causes reduced parallelism, a
throttled runqueue length and a lot of idle time. With increasing number
of CPUs it causes an exponentially worse behavior in AIM7, as the chance
for a newly woken new-owner task to actually run anytime soon is less
and less likely.
Note that it takes just a tiny bit of contention for the 'new-semaphore
catastrophy' to happen: the wakeup latencies get added to whatever small
contention there is, and quickly snowball out of control!
I believe Yanmin's findings and numbers support this analysis too.
The best fix for this problem is to use the same scheduling logic that
the kernel/mutex.c code uses: keep the wake-one behavior (that is OK and
wanted because we do not want to over-schedule), but also allow
opportunistic locking of the lock even if a wakee is already "in
flight".
The patch below implements this new logic. With this patch applied the
AIM7 regression is largely fixed on my quad testbox:
# v2.6.25 vanilla:
..................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 56096.4 91 207.5 789.7 0.4675
2000 55894.4 94 208.2 792.7 0.4658
# v2.6.26-rc1-166-gc0a1811 vanilla:
...................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 33230.6 83 350.3 784.5 0.2769
2000 31778.1 86 366.3 783.6 0.2648
# v2.6.26-rc1-166-gc0a1811 + semaphore-speedup:
...............................................
Tasks Jobs/Min JTI Real CPU Jobs/sec/task
2000 55707.1 92 209.0 795.6 0.4642
2000 55704.4 96 209.0 796.0 0.4642
i.e. a 67% speedup. We are now back to within 1% of the v2.6.25
performance levels and have zero idle time during the test, as expected.
Btw., interactivity also improved dramatically with the fix - for
example console-switching became almost instantaneous during this
workload (which after all is running 2000 tasks at once!), without the
patch it was stuck for a minute at times.
There's another nice side-effect of this speedup patch, the new generic
semaphore code got even smaller:
text data bss dec hex filename
1241 0 0 1241 4d9 semaphore.o.before
1207 0 0 1207 4b7 semaphore.o.after
(because the waiter.up complication got removed.)
Longer-term we should look into using the mutex code for the generic
semaphore code as well - but i's not easy due to legacies and it's
outside of the scope of v2.6.26 and outside the scope of this patch as
well.
Bisected-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-05-08 11:53:48 +02:00
|
|
|
sem->count--;
|
2008-05-10 20:43:22 -07:00
|
|
|
else
|
2014-09-03 03:17:24 -07:00
|
|
|
result = __down_timeout(sem, timeout);
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
2008-03-14 13:43:13 -04:00
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(down_timeout);
|
|
|
|
|
2008-04-11 15:23:52 -04:00
|
|
|
/**
|
|
|
|
* up - release the semaphore
|
|
|
|
* @sem: the semaphore to release
|
|
|
|
*
|
|
|
|
* Release the semaphore. Unlike mutexes, up() may be called from any
|
|
|
|
* context and even by tasks which have never called down().
|
|
|
|
*/
|
2022-09-08 17:08:03 -07:00
|
|
|
void __sched up(struct semaphore *sem)
|
2008-03-07 21:55:58 -05:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_lock_irqsave(&sem->lock, flags);
|
2008-05-10 20:43:22 -07:00
|
|
|
if (likely(list_empty(&sem->wait_list)))
|
|
|
|
sem->count++;
|
|
|
|
else
|
2008-03-07 21:55:58 -05:00
|
|
|
__up(sem);
|
2010-02-24 09:50:22 +01:00
|
|
|
raw_spin_unlock_irqrestore(&sem->lock, flags);
|
2008-03-07 21:55:58 -05:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(up);
|
|
|
|
|
|
|
|
/* Functions for the contended case */
|
|
|
|
|
|
|
|
/*
 * Bookkeeping for one task in the contended path; ___down_common() declares
 * one of these as a local variable.
 */
struct semaphore_waiter {
|
|
|
|
/* List linkage; presumably queued on sem->wait_list - TODO confirm. */
struct list_head list;
|
|
|
|
/* Presumably the waiting task, to be woken by __up() - TODO confirm. */
struct task_struct *task;
|
2013-04-30 15:28:33 -07:00
|
|
|
/* NOTE(review): looks like a "semaphore was handed over" flag - confirm. */
bool up;
|
2008-03-07 21:55:58 -05:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Common slow path: queue the caller on @sem's wait_list and sleep until
 * __up() hands the semaphore over, a signal is pending for @state, or
 * @timeout is used up.
 *
 * The body releases sem->lock around schedule_timeout() and retakes it
 * afterwards, so it is entered and left with sem->lock held.
 *
 * Returns 0 when the waiter was marked up by __up(), -EINTR when
 * interrupted by a signal, -ETIME when the timeout expired.
 *
 * Because this function is inlined, the 'state' parameter will be
 * constant, and thus optimised away by the compiler.  Likewise the
 * 'timeout' parameter for the cases without timeouts.
 */
static inline int __sched ___down_common(struct semaphore *sem, long state,
								long timeout)
{
	struct semaphore_waiter waiter;

	/* Join the tail of the queue; __up() picks waiters from the head. */
	list_add_tail(&waiter.list, &sem->wait_list);
	waiter.task = current;
	waiter.up = false;

	for (;;) {
		if (signal_pending_state(state, current))
			goto interrupted;
		if (unlikely(timeout <= 0))
			goto timed_out;
		__set_current_state(state);
		/* Sleep without the lock; schedule_timeout() returns the time left. */
		raw_spin_unlock_irq(&sem->lock);
		timeout = schedule_timeout(timeout);
		raw_spin_lock_irq(&sem->lock);
		/*
		 * __up() has already unlinked us from the wait_list when it set
		 * ->up, so there is nothing to clean up on success.  Any other
		 * wakeup just goes around the loop again.
		 */
		if (waiter.up)
			return 0;
	}

	/* On both failure paths we are still queued and must unlink ourselves. */
timed_out:
	list_del(&waiter.list);
	return -ETIME;

interrupted:
	list_del(&waiter.list);
	return -EINTR;
}
|
|
|
|
|
2022-03-22 11:57:09 -07:00
|
|
|
/*
 * Tracing wrapper around ___down_common(): emits the contention begin/end
 * trace events on either side of the wait and passes the result through
 * unchanged.
 */
static inline int __sched __down_common(struct semaphore *sem, long state,
					long timeout)
{
	int outcome;

	trace_contention_begin(sem, 0);
	outcome = ___down_common(sem, state, timeout);
	trace_contention_end(sem, outcome);

	return outcome;
}
|
|
|
|
|
2008-03-07 21:55:58 -05:00
|
|
|
/*
 * Contended path of down(): wait in TASK_UNINTERRUPTIBLE with
 * MAX_SCHEDULE_TIMEOUT.  The result of __down_common() is discarded, as
 * this variant returns void.
 */
static noinline void __sched __down(struct semaphore *sem)
{
	__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
|
|
|
|
|
|
|
|
/*
 * Contended path of down_interruptible(): wait in TASK_INTERRUPTIBLE with
 * MAX_SCHEDULE_TIMEOUT and return whatever __down_common() reports
 * (0 on success, -EINTR if a signal cut the wait short).
 */
static noinline int __sched __down_interruptible(struct semaphore *sem)
{
	return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
|
|
|
|
|
2008-03-14 13:19:33 -04:00
|
|
|
/*
 * Contended path of down_killable(): wait in TASK_KILLABLE with
 * MAX_SCHEDULE_TIMEOUT and return the result of __down_common().
 */
static noinline int __sched __down_killable(struct semaphore *sem)
{
	return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}
|
|
|
|
|
2014-09-03 03:17:24 -07:00
|
|
|
/*
 * Contended path of down_timeout(): wait in TASK_UNINTERRUPTIBLE, bounded
 * by @timeout.  @timeout is handed to schedule_timeout() as is, so it is
 * presumably in jiffies -- confirm against the down_timeout() kernel-doc.
 * Returns the result of __down_common() (0, or -ETIME on expiry).
 */
static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
{
	return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
|
|
|
|
|
2008-03-07 21:55:58 -05:00
|
|
|
/*
 * Contended path of up(): hand the semaphore to the waiter at the head of
 * the wait_list.  up() only calls this when the list is non-empty, and it
 * does so with sem->lock held.
 */
static noinline void __sched __up(struct semaphore *sem)
{
	struct semaphore_waiter *first;

	first = list_first_entry(&sem->wait_list, struct semaphore_waiter, list);

	/* Unlink before flagging, so the woken task has no list cleanup to do. */
	list_del(&first->list);
	first->up = true;
	wake_up_process(first->task);
}
|