mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-15 09:34:17 +00:00
481eaec37e
This adds micro-benchmarks useful for tuning virtio ring layouts.
Three layouts are currently implemented:
- virtio 0.9 compatible one
- an experimental extension bypassing the ring index, polling ring itself instead
- an experimental extension bypassing avail and used ring completely

Typical use:
	sh run-on-all.sh perf stat -r 10 --log-fd 1 -- ./ring

It doesn't depend on the kernel directly, but it's handy to have as much virtio stuff as possible in one tree.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
273 lines
6.0 KiB
C
273 lines
6.0 KiB
C
/*
 * Copyright (C) 2016 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * Simple descriptor-based ring. virtio 0.9 compatible event index is used for
 * signalling, unconditionally.
 */
|
|
#define _GNU_SOURCE
#include "main.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
|
|
|
/* Decide whether the peer has to be signalled.
 *
 * Next - Where next entry will be written.
 * Prev - "Next" value when event triggered previously.
 * Event - Peer requested event after writing this entry.
 *
 * All arithmetic wraps modulo 2^16, so the comparison below is true exactly
 * when Event lies in the half-open window [Prev, Next), i.e. the entry the
 * peer asked about has been written since the previous signal.
 */
static inline bool need_event(unsigned short event,
			      unsigned short next,
			      unsigned short prev)
{
	/* Distance from the requested entry to the last written one. */
	unsigned short since_event = (unsigned short)(next - event - 1);
	/* Number of entries written since the previous signal. */
	unsigned short since_prev = (unsigned short)(next - prev);

	return since_event < since_prev;
}
|
|
|
|
/* Design:
 * Guest adds descriptors with unique index values and DESC_HW in flags.
 * Host overwrites used descriptors with correct len, index, and DESC_HW clear.
 * Flags are always set last.
 */
#define DESC_HW 0x1
|
|
|
|
/* One ring slot. Written by the guest in add_inbuf() and updated in place by
 * the host in use_buf().
 */
struct desc {
	/* DESC_HW is set by the guest and cleared by the host. */
	unsigned short flags;
	/* Slot in the guest's data[] array that describes this buffer. */
	unsigned short index;
	/* Buffer length as posted by the guest; decremented by the host. */
	unsigned len;
	/* Buffer address stored as an integer. */
	unsigned long long addr;
};
|
|
|
|
/* how much padding is needed to avoid false cache sharing */
#define HOST_GUEST_PADDING 0x80

/* Mostly read.
 *
 * Event indices published by each side for the other one to inspect:
 * kick_index is written in enable_kick() and read in kick_available(),
 * call_index is written in enable_call() and read in call_used().
 * Each index is padded out to HOST_GUEST_PADDING bytes.
 */
struct event {
	unsigned short kick_index;
	unsigned char reserved0[HOST_GUEST_PADDING - 2];
	unsigned short call_index;
	unsigned char reserved1[HOST_GUEST_PADDING - 2];
};
|
|
|
|
/* Guest-side bookkeeping for one posted buffer. The global array is allocated
 * in alloc_ring() and is indexed by the index field of a descriptor.
 */
struct data {
	void *buf; /* descriptor is writeable, we can't get buf from there */
	void *data; /* caller's cookie, handed back by get_buf() */
} *data;
|
|
|
|
/* Descriptor array; allocated and initialised in alloc_ring(). */
struct desc *ring;
/* Kick/call event indices; allocated and zeroed in alloc_ring(). */
struct event *event;
|
|
|
|
struct guest {
|
|
unsigned avail_idx;
|
|
unsigned last_used_idx;
|
|
unsigned num_free;
|
|
unsigned kicked_avail_idx;
|
|
unsigned char reserved[HOST_GUEST_PADDING - 12];
|
|
} guest;
|
|
|
|
struct host {
|
|
/* we do not need to track last avail index
|
|
* unless we have more than one in flight.
|
|
*/
|
|
unsigned used_idx;
|
|
unsigned called_used_idx;
|
|
unsigned char reserved[HOST_GUEST_PADDING - 4];
|
|
} host;
|
|
|
|
/* implemented by ring */
|
|
void alloc_ring(void)
|
|
{
|
|
int ret;
|
|
int i;
|
|
|
|
ret = posix_memalign((void **)&ring, 0x1000, ring_size * sizeof *ring);
|
|
if (ret) {
|
|
perror("Unable to allocate ring buffer.\n");
|
|
exit(3);
|
|
}
|
|
event = malloc(sizeof *event);
|
|
if (!event) {
|
|
perror("Unable to allocate event buffer.\n");
|
|
exit(3);
|
|
}
|
|
memset(event, 0, sizeof *event);
|
|
guest.avail_idx = 0;
|
|
guest.kicked_avail_idx = -1;
|
|
guest.last_used_idx = 0;
|
|
host.used_idx = 0;
|
|
host.called_used_idx = -1;
|
|
for (i = 0; i < ring_size; ++i) {
|
|
struct desc desc = {
|
|
.index = i,
|
|
};
|
|
ring[i] = desc;
|
|
}
|
|
guest.num_free = ring_size;
|
|
data = malloc(ring_size * sizeof *data);
|
|
if (!data) {
|
|
perror("Unable to allocate data buffer.\n");
|
|
exit(3);
|
|
}
|
|
memset(data, 0, ring_size * sizeof *data);
|
|
}
|
|
|
|
/* guest side */
|
|
int add_inbuf(unsigned len, void *buf, void *datap)
|
|
{
|
|
unsigned head, index;
|
|
|
|
if (!guest.num_free)
|
|
return -1;
|
|
|
|
guest.num_free--;
|
|
head = (ring_size - 1) & (guest.avail_idx++);
|
|
|
|
/* Start with a write. On MESI architectures this helps
|
|
* avoid a shared state with consumer that is polling this descriptor.
|
|
*/
|
|
ring[head].addr = (unsigned long)(void*)buf;
|
|
ring[head].len = len;
|
|
/* read below might bypass write above. That is OK because it's just an
|
|
* optimization. If this happens, we will get the cache line in a
|
|
* shared state which is unfortunate, but probably not worth it to
|
|
* add an explicit full barrier to avoid this.
|
|
*/
|
|
barrier();
|
|
index = ring[head].index;
|
|
data[index].buf = buf;
|
|
data[index].data = datap;
|
|
/* Barrier A (for pairing) */
|
|
smp_release();
|
|
ring[head].flags = DESC_HW;
|
|
|
|
return 0;
|
|
}
|
|
|
|
void *get_buf(unsigned *lenp, void **bufp)
|
|
{
|
|
unsigned head = (ring_size - 1) & guest.last_used_idx;
|
|
unsigned index;
|
|
void *datap;
|
|
|
|
if (ring[head].flags & DESC_HW)
|
|
return NULL;
|
|
/* Barrier B (for pairing) */
|
|
smp_acquire();
|
|
*lenp = ring[head].len;
|
|
index = ring[head].index & (ring_size - 1);
|
|
datap = data[index].data;
|
|
*bufp = data[index].buf;
|
|
data[index].buf = NULL;
|
|
data[index].data = NULL;
|
|
guest.num_free++;
|
|
guest.last_used_idx++;
|
|
return datap;
|
|
}
|
|
|
|
void poll_used(void)
|
|
{
|
|
unsigned head = (ring_size - 1) & guest.last_used_idx;
|
|
|
|
while (ring[head].flags & DESC_HW)
|
|
busy_wait();
|
|
}
|
|
|
|
/* Guest side: intentionally a no-op. */
void disable_call(void)
{
	/* Doing nothing to disable calls might cause
	 * extra interrupts, but reduces the number of cache misses.
	 */
}
|
|
|
|
bool enable_call()
|
|
{
|
|
unsigned head = (ring_size - 1) & guest.last_used_idx;
|
|
|
|
event->call_index = guest.last_used_idx;
|
|
/* Flush call index write */
|
|
/* Barrier D (for pairing) */
|
|
smp_mb();
|
|
return ring[head].flags & DESC_HW;
|
|
}
|
|
|
|
void kick_available(void)
|
|
{
|
|
/* Flush in previous flags write */
|
|
/* Barrier C (for pairing) */
|
|
smp_mb();
|
|
if (!need_event(event->kick_index,
|
|
guest.avail_idx,
|
|
guest.kicked_avail_idx))
|
|
return;
|
|
|
|
guest.kicked_avail_idx = guest.avail_idx;
|
|
kick();
|
|
}
|
|
|
|
/* host side */
|
|
/* Host side: intentionally a no-op. */
void disable_kick(void)
{
	/* Doing nothing to disable kicks might cause
	 * extra interrupts, but reduces the number of cache misses.
	 */
}
|
|
|
|
bool enable_kick()
|
|
{
|
|
unsigned head = (ring_size - 1) & host.used_idx;
|
|
|
|
event->kick_index = host.used_idx;
|
|
/* Barrier C (for pairing) */
|
|
smp_mb();
|
|
return !(ring[head].flags & DESC_HW);
|
|
}
|
|
|
|
void poll_avail(void)
|
|
{
|
|
unsigned head = (ring_size - 1) & host.used_idx;
|
|
|
|
while (!(ring[head].flags & DESC_HW))
|
|
busy_wait();
|
|
}
|
|
|
|
bool use_buf(unsigned *lenp, void **bufp)
|
|
{
|
|
unsigned head = (ring_size - 1) & host.used_idx;
|
|
|
|
if (!(ring[head].flags & DESC_HW))
|
|
return false;
|
|
|
|
/* make sure length read below is not speculated */
|
|
/* Barrier A (for pairing) */
|
|
smp_acquire();
|
|
|
|
/* simple in-order completion: we don't need
|
|
* to touch index at all. This also means we
|
|
* can just modify the descriptor in-place.
|
|
*/
|
|
ring[head].len--;
|
|
/* Make sure len is valid before flags.
|
|
* Note: alternative is to write len and flags in one access -
|
|
* possible on 64 bit architectures but wmb is free on Intel anyway
|
|
* so I have no way to test whether it's a gain.
|
|
*/
|
|
/* Barrier B (for pairing) */
|
|
smp_release();
|
|
ring[head].flags = 0;
|
|
host.used_idx++;
|
|
return true;
|
|
}
|
|
|
|
void call_used(void)
|
|
{
|
|
/* Flush in previous flags write */
|
|
/* Barrier D (for pairing) */
|
|
smp_mb();
|
|
if (!need_event(event->call_index,
|
|
host.used_idx,
|
|
host.called_used_idx))
|
|
return;
|
|
|
|
host.called_used_idx = host.used_idx;
|
|
call();
|
|
}
|