mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-10 15:10:38 +00:00
0ddad21d3e
This makes the pipe code use separate wait-queues and exclusive waiting for readers and writers, avoiding a nasty thundering herd problem when there are lots of readers waiting for data on a pipe (or, less commonly, lots of writers waiting for a pipe to have space). While this isn't a common occurrence in the traditional "use a pipe as a data transport" case, where you typically only have a single reader and a single writer process, there is one common special case: using a pipe as a source of "locking tokens" rather than for data communication. In particular, the GNU make jobserver code ends up using a pipe as a way to limit parallelism, where each job consumes a token by reading a byte from the jobserver pipe, and releases the token by writing a byte back to the pipe. This pattern is fairly traditional on Unix, and works very well, but will waste a lot of time waking up a lot of processes when only a single reader needs to be woken up when a writer releases a new token. A simplified test-case of just this pipe interaction is to create 64 processes, and then pass a single token around between them (this test-case also intentionally passes another token that gets ignored to test the "wake up next" logic too, in case anybody wonders about it): #include <unistd.h> int main(int argc, char **argv) { int fd[2], counters[2]; pipe(fd); counters[0] = 0; counters[1] = -1; write(fd[1], counters, sizeof(counters)); /* 64 processes */ fork(); fork(); fork(); fork(); fork(); fork(); do { int i; read(fd[0], &i, sizeof(i)); if (i < 0) continue; counters[0] = i+1; write(fd[1], counters, (1+(i & 1)) *sizeof(int)); } while (counters[0] < 1000000); return 0; } and in a perfect world, passing that token around should only cause one context switch per transfer, when the writer of a token causes a directed wakeup of just a single reader. But with the "writer wakes all readers" model we traditionally had, on my test box the above case causes more than an order of magnitude more scheduling: instead of the expected ~1M context switches, "perf stat" shows 231,852.37 msec task-clock # 15.857 CPUs utilized 11,250,961 context-switches # 0.049 M/sec 616,304 cpu-migrations # 0.003 M/sec 1,648 page-faults # 0.007 K/sec 1,097,903,998,514 cycles # 4.735 GHz 120,781,778,352 instructions # 0.11 insn per cycle 27,997,056,043 branches # 120.754 M/sec 283,581,233 branch-misses # 1.01% of all branches 14.621273891 seconds time elapsed 0.018243000 seconds user 3.611468000 seconds sys before this commit. After this commit, I get 5,229.55 msec task-clock # 3.072 CPUs utilized 1,212,233 context-switches # 0.232 M/sec 103,951 cpu-migrations # 0.020 M/sec 1,328 page-faults # 0.254 K/sec 21,307,456,166 cycles # 4.074 GHz 12,947,819,999 instructions # 0.61 insn per cycle 2,881,985,678 branches # 551.096 M/sec 64,267,015 branch-misses # 2.23% of all branches 1.702148350 seconds time elapsed 0.004868000 seconds user 0.110786000 seconds sys instead. Much better. [ Note! This kernel improvement seems to be very good at triggering a race condition in the make jobserver (in GNU make 4.2.1) for me. It's a long known bug that was fixed back in June 2017 by GNU make commit b552b0525198 ("[SV 51159] Use a non-blocking read with pselect to avoid hangs."). But there wasn't a new release of GNU make until 4.3 on Jan 19 2020, so a number of distributions may still have the buggy version. Some have backported the fix to their 4.2.1 release, though, and even without the fix it's quite timing-dependent whether the bug actually is hit. ] Josh Triplett says: "I've been hammering on your pipe fix patch (switching to exclusive wait queues) for a month or so, on several different systems, and I've run into no issues with it. The patch *substantially* improves parallel build times on large (~100 CPU) systems, both with parallel make and with other things that use make's pipe-based jobserver. All current distributions (including stable and long-term stable distributions) have versions of GNU make that no longer have the jobserver bug" Tested-by: Josh Triplett <josh@joshtriplett.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
247 lines
7.5 KiB
C
247 lines
7.5 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_PIPE_FS_I_H
|
|
#define _LINUX_PIPE_FS_I_H
|
|
|
|
#define PIPE_DEF_BUFFERS 16
|
|
|
|
#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
|
|
#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */
|
|
#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */
|
|
#define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */
|
|
|
|
/**
|
|
* struct pipe_buffer - a linux kernel pipe buffer
|
|
* @page: the page containing the data for the pipe buffer
|
|
* @offset: offset of data inside the @page
|
|
* @len: length of data inside the @page
|
|
* @ops: operations associated with this buffer. See @pipe_buf_operations.
|
|
* @flags: pipe buffer flags. See above.
|
|
* @private: private data owned by the ops.
|
|
**/
|
|
struct pipe_buffer {
|
|
struct page *page;
|
|
unsigned int offset, len;
|
|
const struct pipe_buf_operations *ops;
|
|
unsigned int flags;
|
|
unsigned long private;
|
|
};
|
|
|
|
/**
|
|
* struct pipe_inode_info - a linux kernel pipe
|
|
* @mutex: mutex protecting the whole thing
|
|
* @wait: reader/writer wait point in case of empty/full pipe
|
|
* @head: The point of buffer production
|
|
* @tail: The point of buffer consumption
|
|
* @max_usage: The maximum number of slots that may be used in the ring
|
|
* @ring_size: total number of buffers (should be a power of 2)
|
|
* @tmp_page: cached released page
|
|
* @readers: number of current readers of this pipe
|
|
* @writers: number of current writers of this pipe
|
|
* @files: number of struct file referring this pipe (protected by ->i_lock)
|
|
* @r_counter: reader counter
|
|
* @w_counter: writer counter
|
|
* @fasync_readers: reader side fasync
|
|
* @fasync_writers: writer side fasync
|
|
* @bufs: the circular array of pipe buffers
|
|
* @user: the user who created this pipe
|
|
**/
|
|
struct pipe_inode_info {
|
|
struct mutex mutex;
|
|
wait_queue_head_t rd_wait, wr_wait;
|
|
unsigned int head;
|
|
unsigned int tail;
|
|
unsigned int max_usage;
|
|
unsigned int ring_size;
|
|
unsigned int readers;
|
|
unsigned int writers;
|
|
unsigned int files;
|
|
unsigned int r_counter;
|
|
unsigned int w_counter;
|
|
struct page *tmp_page;
|
|
struct fasync_struct *fasync_readers;
|
|
struct fasync_struct *fasync_writers;
|
|
struct pipe_buffer *bufs;
|
|
struct user_struct *user;
|
|
};
|
|
|
|
/*
|
|
* Note on the nesting of these functions:
|
|
*
|
|
* ->confirm()
|
|
* ->steal()
|
|
*
|
|
* That is, ->steal() must be called on a confirmed buffer.
|
|
* See below for the meaning of each operation. Also see kerneldoc
|
|
* in fs/pipe.c for the pipe and generic variants of these hooks.
|
|
*/
|
|
struct pipe_buf_operations {
|
|
/*
|
|
* ->confirm() verifies that the data in the pipe buffer is there
|
|
* and that the contents are good. If the pages in the pipe belong
|
|
* to a file system, we may need to wait for IO completion in this
|
|
* hook. Returns 0 for good, or a negative error value in case of
|
|
* error.
|
|
*/
|
|
int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
|
|
|
|
/*
|
|
* When the contents of this pipe buffer has been completely
|
|
* consumed by a reader, ->release() is called.
|
|
*/
|
|
void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
|
|
|
|
/*
|
|
* Attempt to take ownership of the pipe buffer and its contents.
|
|
* ->steal() returns 0 for success, in which case the contents
|
|
* of the pipe (the buf->page) is locked and now completely owned
|
|
* by the caller. The page may then be transferred to a different
|
|
* mapping, the most often used case is insertion into different
|
|
* file address space cache.
|
|
*/
|
|
int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);
|
|
|
|
/*
|
|
* Get a reference to the pipe buffer.
|
|
*/
|
|
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
|
|
};
|
|
|
|
/**
|
|
* pipe_empty - Return true if the pipe is empty
|
|
* @head: The pipe ring head pointer
|
|
* @tail: The pipe ring tail pointer
|
|
*/
|
|
static inline bool pipe_empty(unsigned int head, unsigned int tail)
|
|
{
|
|
return head == tail;
|
|
}
|
|
|
|
/**
|
|
* pipe_occupancy - Return number of slots used in the pipe
|
|
* @head: The pipe ring head pointer
|
|
* @tail: The pipe ring tail pointer
|
|
*/
|
|
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
|
|
{
|
|
return head - tail;
|
|
}
|
|
|
|
/**
|
|
* pipe_full - Return true if the pipe is full
|
|
* @head: The pipe ring head pointer
|
|
* @tail: The pipe ring tail pointer
|
|
* @limit: The maximum amount of slots available.
|
|
*/
|
|
static inline bool pipe_full(unsigned int head, unsigned int tail,
|
|
unsigned int limit)
|
|
{
|
|
return pipe_occupancy(head, tail) >= limit;
|
|
}
|
|
|
|
/**
|
|
* pipe_space_for_user - Return number of slots available to userspace
|
|
* @head: The pipe ring head pointer
|
|
* @tail: The pipe ring tail pointer
|
|
* @pipe: The pipe info structure
|
|
*/
|
|
static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int tail,
|
|
struct pipe_inode_info *pipe)
|
|
{
|
|
unsigned int p_occupancy, p_space;
|
|
|
|
p_occupancy = pipe_occupancy(head, tail);
|
|
if (p_occupancy >= pipe->max_usage)
|
|
return 0;
|
|
p_space = pipe->ring_size - p_occupancy;
|
|
if (p_space > pipe->max_usage)
|
|
p_space = pipe->max_usage;
|
|
return p_space;
|
|
}
|
|
|
|
/**
|
|
* pipe_buf_get - get a reference to a pipe_buffer
|
|
* @pipe: the pipe that the buffer belongs to
|
|
* @buf: the buffer to get a reference to
|
|
*
|
|
* Return: %true if the reference was successfully obtained.
|
|
*/
|
|
static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
|
|
struct pipe_buffer *buf)
|
|
{
|
|
return buf->ops->get(pipe, buf);
|
|
}
|
|
|
|
/**
|
|
* pipe_buf_release - put a reference to a pipe_buffer
|
|
* @pipe: the pipe that the buffer belongs to
|
|
* @buf: the buffer to put a reference to
|
|
*/
|
|
static inline void pipe_buf_release(struct pipe_inode_info *pipe,
|
|
struct pipe_buffer *buf)
|
|
{
|
|
const struct pipe_buf_operations *ops = buf->ops;
|
|
|
|
buf->ops = NULL;
|
|
ops->release(pipe, buf);
|
|
}
|
|
|
|
/**
|
|
* pipe_buf_confirm - verify contents of the pipe buffer
|
|
* @pipe: the pipe that the buffer belongs to
|
|
* @buf: the buffer to confirm
|
|
*/
|
|
static inline int pipe_buf_confirm(struct pipe_inode_info *pipe,
|
|
struct pipe_buffer *buf)
|
|
{
|
|
return buf->ops->confirm(pipe, buf);
|
|
}
|
|
|
|
/**
|
|
* pipe_buf_steal - attempt to take ownership of a pipe_buffer
|
|
* @pipe: the pipe that the buffer belongs to
|
|
* @buf: the buffer to attempt to steal
|
|
*/
|
|
static inline int pipe_buf_steal(struct pipe_inode_info *pipe,
|
|
struct pipe_buffer *buf)
|
|
{
|
|
return buf->ops->steal(pipe, buf);
|
|
}
|
|
|
|
/* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
|
|
memory allocation, whereas PIPE_BUF makes atomicity guarantees. */
|
|
#define PIPE_SIZE PAGE_SIZE
|
|
|
|
/* Pipe lock and unlock operations */
|
|
void pipe_lock(struct pipe_inode_info *);
|
|
void pipe_unlock(struct pipe_inode_info *);
|
|
void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
|
|
|
|
extern unsigned int pipe_max_size;
|
|
extern unsigned long pipe_user_pages_hard;
|
|
extern unsigned long pipe_user_pages_soft;
|
|
|
|
/* Drop the inode semaphore and wait for a pipe event, atomically */
|
|
void pipe_wait(struct pipe_inode_info *pipe);
|
|
|
|
struct pipe_inode_info *alloc_pipe_info(void);
|
|
void free_pipe_info(struct pipe_inode_info *);
|
|
|
|
/* Generic pipe buffer ops functions */
|
|
bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
|
|
int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
|
|
int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
|
|
int generic_pipe_buf_nosteal(struct pipe_inode_info *, struct pipe_buffer *);
|
|
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
|
|
void pipe_buf_mark_unmergeable(struct pipe_buffer *buf);
|
|
|
|
extern const struct pipe_buf_operations nosteal_pipe_buf_ops;
|
|
|
|
/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
|
|
long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
|
|
struct pipe_inode_info *get_pipe_info(struct file *file);
|
|
|
|
int create_pipe_files(struct file **, int);
|
|
unsigned int round_pipe_size(unsigned long size);
|
|
|
|
#endif
|