Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2021-05-11

The following pull-request contains BPF updates for your *net* tree.

We've added 13 non-merge commits during the last 8 day(s) which contain
a total of 21 files changed, 817 insertions(+), 382 deletions(-).

The main changes are:

1) Fix multiple ringbuf bugs in particular to prevent writable mmap of
   read-only pages, from Andrii Nakryiko & Thadeu Lima de Souza Cascardo.

2) Fix verifier alu32 known-const subregister bound tracking for bitwise
   operations and/or/xor, from Daniel Borkmann.

3) Reject trampoline attachment for functions with variable arguments,
   and also add a deny list of other forbidden functions, from Jiri Olsa.

4) Fix nested bpf_bprintf_prepare() calls used by various helpers by
   switching to per-CPU buffers, from Florent Revest.

5) Fix kernel compilation with BTF debug info on ppc64 due to pahole
   missing TCP-CC functions like cubictcp_init, from Martin KaFai Lau.

6) Add a kconfig entry to provide an option to disallow unprivileged
   BPF by default, from Daniel Borkmann.

7) Fix libbpf compilation for older libelf when GELF_ST_VISIBILITY()
   macro is not available, from Arnaldo Carvalho de Melo.

8) Migrate test_tc_redirect to test_progs framework as prep work
   for upcoming skb_change_head() fix & selftest, from Jussi Maki.

9) Fix a libbpf segfault in add_dummy_ksym_var() if BTF is not
   present, from Ian Rogers.

10) Fix tx_only micro-benchmark in xdpsock BPF sample with proper frame
    size, from Magnus Karlsson.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2021-05-11 16:05:56 -07:00
commit df6f823703
21 changed files with 817 additions and 382 deletions

View File

@ -1457,11 +1457,22 @@ unprivileged_bpf_disabled
========================= =========================
Writing 1 to this entry will disable unprivileged calls to ``bpf()``; Writing 1 to this entry will disable unprivileged calls to ``bpf()``;
once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` will return once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` or ``CAP_BPF``
``-EPERM``. will return ``-EPERM``. Once set to 1, this can't be cleared from the
running kernel anymore.
Once set, this can't be cleared. Writing 2 to this entry will also disable unprivileged calls to ``bpf()``,
however, an admin can still change this setting later on, if needed, by
writing 0 or 1 to this entry.
If ``BPF_UNPRIV_DEFAULT_OFF`` is enabled in the kernel config, then this
entry will default to 2 instead of 0.
= =============================================================
0 Unprivileged calls to ``bpf()`` are enabled
1 Unprivileged calls to ``bpf()`` are disabled without recovery
2 Unprivileged calls to ``bpf()`` are disabled
= =============================================================
watchdog watchdog
======== ========

View File

@ -442,6 +442,7 @@ config AUDITSYSCALL
source "kernel/irq/Kconfig" source "kernel/irq/Kconfig"
source "kernel/time/Kconfig" source "kernel/time/Kconfig"
source "kernel/bpf/Kconfig"
source "kernel/Kconfig.preempt" source "kernel/Kconfig.preempt"
menu "CPU/Task time and stats accounting" menu "CPU/Task time and stats accounting"
@ -1713,46 +1714,6 @@ config KALLSYMS_BASE_RELATIVE
# syscall, maps, verifier # syscall, maps, verifier
config BPF_LSM
bool "LSM Instrumentation with BPF"
depends on BPF_EVENTS
depends on BPF_SYSCALL
depends on SECURITY
depends on BPF_JIT
help
Enables instrumentation of the security hooks with eBPF programs for
implementing dynamic MAC and Audit Policies.
If you are unsure how to answer this question, answer N.
config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if INET
default n
help
Enable the bpf() system call that allows to manipulate eBPF
programs and maps via file descriptors.
config ARCH_WANT_DEFAULT_BPF_JIT
bool
config BPF_JIT_ALWAYS_ON
bool "Permanently enable BPF JIT and remove BPF interpreter"
depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
help
Enables BPF JIT and removes BPF interpreter to avoid
speculative execution of BPF instructions by the interpreter
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
source "kernel/bpf/preload/Kconfig"
config USERFAULTFD config USERFAULTFD
bool "Enable userfaultfd() system call" bool "Enable userfaultfd() system call"
depends on MMU depends on MMU

88
kernel/bpf/Kconfig Normal file
View File

@ -0,0 +1,88 @@
# SPDX-License-Identifier: GPL-2.0-only
# BPF interpreter that, for example, classic socket filters depend on.
config BPF
bool
# Used by archs to tell that they support BPF JIT compiler plus which
# flavour. Only one of the two can be selected for a specific arch since
# eBPF JIT supersedes the cBPF JIT.
# Classic BPF JIT (cBPF)
config HAVE_CBPF_JIT
bool
# Extended BPF JIT (eBPF)
config HAVE_EBPF_JIT
bool
# Used by archs to tell that they want the BPF JIT compiler enabled by
# default for kernels that were compiled with BPF JIT support.
config ARCH_WANT_DEFAULT_BPF_JIT
bool
menu "BPF subsystem"
config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if INET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
and maps via file descriptors.
config BPF_JIT
bool "Enable BPF Just In Time compiler"
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
depends on MODULES
help
BPF programs are normally handled by a BPF interpreter. This option
allows the kernel to generate native code when a program is loaded
into the kernel. This will significantly speed-up processing of BPF
programs.
Note, an admin should enable this feature by changing:
/proc/sys/net/core/bpf_jit_enable
/proc/sys/net/core/bpf_jit_harden (optional)
/proc/sys/net/core/bpf_jit_kallsyms (optional)
config BPF_JIT_ALWAYS_ON
bool "Permanently enable BPF JIT and remove BPF interpreter"
depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
help
Enables BPF JIT and removes BPF interpreter to avoid speculative
execution of BPF instructions by the interpreter.
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
config BPF_UNPRIV_DEFAULT_OFF
bool "Disable unprivileged BPF by default"
depends on BPF_SYSCALL
help
Disables unprivileged BPF by default by setting the corresponding
/proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can
still reenable it by setting it to 0 later on, or permanently
disable it by setting it to 1 (from which no other transition to
0 is possible anymore).
source "kernel/bpf/preload/Kconfig"
config BPF_LSM
bool "Enable BPF LSM Instrumentation"
depends on BPF_EVENTS
depends on BPF_SYSCALL
depends on SECURITY
depends on BPF_JIT
help
Enables instrumentation of the security hooks with BPF programs for
implementing dynamic MAC and Audit Policies.
If you are unsure how to answer this question, answer N.
endmenu # "BPF subsystem"

View File

@ -5206,6 +5206,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
m->ret_size = ret; m->ret_size = ret;
for (i = 0; i < nargs; i++) { for (i = 0; i < nargs; i++) {
if (i == nargs - 1 && args[i].type == 0) {
bpf_log(log,
"The function %s with variable args is unsupported.\n",
tname);
return -EINVAL;
}
ret = __get_type_size(btf, args[i].type, &t); ret = __get_type_size(btf, args[i].type, &t);
if (ret < 0) { if (ret < 0) {
bpf_log(log, bpf_log(log,
@ -5213,6 +5219,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
return -EINVAL; return -EINVAL;
} }
if (ret == 0) {
bpf_log(log,
"The function %s has malformed void argument.\n",
tname);
return -EINVAL;
}
m->arg_size[i] = ret; m->arg_size[i] = ret;
} }
m->nr_args = nargs; m->nr_args = nargs;

View File

@ -696,34 +696,35 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
*/ */
#define MAX_PRINTF_BUF_LEN 512 #define MAX_PRINTF_BUF_LEN 512
struct bpf_printf_buf { /* Support executing three nested bprintf helper calls on a given CPU */
char tmp_buf[MAX_PRINTF_BUF_LEN]; struct bpf_bprintf_buffers {
char tmp_bufs[3][MAX_PRINTF_BUF_LEN];
}; };
static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_printf_buf_used); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
static int try_get_fmt_tmp_buf(char **tmp_buf) static int try_get_fmt_tmp_buf(char **tmp_buf)
{ {
struct bpf_printf_buf *bufs; struct bpf_bprintf_buffers *bufs;
int used; int nest_level;
preempt_disable(); preempt_disable();
used = this_cpu_inc_return(bpf_printf_buf_used); nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(used > 1)) { if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bufs->tmp_bufs))) {
this_cpu_dec(bpf_printf_buf_used); this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable(); preempt_enable();
return -EBUSY; return -EBUSY;
} }
bufs = this_cpu_ptr(&bpf_printf_buf); bufs = this_cpu_ptr(&bpf_bprintf_bufs);
*tmp_buf = bufs->tmp_buf; *tmp_buf = bufs->tmp_bufs[nest_level - 1];
return 0; return 0;
} }
void bpf_bprintf_cleanup(void) void bpf_bprintf_cleanup(void)
{ {
if (this_cpu_read(bpf_printf_buf_used)) { if (this_cpu_read(bpf_bprintf_nest_level)) {
this_cpu_dec(bpf_printf_buf_used); this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable(); preempt_enable();
} }
} }

View File

@ -221,25 +221,20 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP; return -ENOTSUPP;
} }
static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb)
{
size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT;
/* consumer page + producer page + 2 x data pages */
return RINGBUF_POS_PAGES + 2 * data_pages;
}
static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{ {
struct bpf_ringbuf_map *rb_map; struct bpf_ringbuf_map *rb_map;
size_t mmap_sz;
rb_map = container_of(map, struct bpf_ringbuf_map, map); rb_map = container_of(map, struct bpf_ringbuf_map, map);
mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT;
if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz)
return -EINVAL;
if (vma->vm_flags & VM_WRITE) {
/* allow writable mapping for the consumer_pos only */
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
} else {
vma->vm_flags &= ~VM_MAYWRITE;
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb, return remap_vmalloc_range(vma, rb_map->rb,
vma->vm_pgoff + RINGBUF_PGOFF); vma->vm_pgoff + RINGBUF_PGOFF);
} }
@ -315,6 +310,9 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
return NULL; return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
if (len > rb->mask + 1)
return NULL;
cons_pos = smp_load_acquire(&rb->consumer_pos); cons_pos = smp_load_acquire(&rb->consumer_pos);
if (in_nmi()) { if (in_nmi()) {

View File

@ -50,7 +50,8 @@ static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr); static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock); static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly; int sysctl_unprivileged_bpf_disabled __read_mostly =
IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
static const struct bpf_map_ops * const bpf_map_types[] = { static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)

View File

@ -7084,11 +7084,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
s32 smin_val = src_reg->s32_min_value; s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value; u32 umax_val = src_reg->u32_max_value;
/* Assuming scalar64_min_max_and will be called so its safe if (src_known && dst_known) {
* to skip updating register for known 32-bit case. __mark_reg32_known(dst_reg, var32_off.value);
*/
if (src_known && dst_known)
return; return;
}
/* We get our minimum from the var_off, since that's inherently /* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima. * bitwise. Our maximum is the minimum of the operands' maxima.
@ -7108,7 +7107,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value; dst_reg->s32_max_value = dst_reg->u32_max_value;
} }
} }
static void scalar_min_max_and(struct bpf_reg_state *dst_reg, static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
@ -7155,11 +7153,10 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
s32 smin_val = src_reg->s32_min_value; s32 smin_val = src_reg->s32_min_value;
u32 umin_val = src_reg->u32_min_value; u32 umin_val = src_reg->u32_min_value;
/* Assuming scalar64_min_max_or will be called so it is safe if (src_known && dst_known) {
* to skip updating register for known case. __mark_reg32_known(dst_reg, var32_off.value);
*/
if (src_known && dst_known)
return; return;
}
/* We get our maximum from the var_off, and our minimum is the /* We get our maximum from the var_off, and our minimum is the
* maximum of the operands' minima * maximum of the operands' minima
@ -7224,11 +7221,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
struct tnum var32_off = tnum_subreg(dst_reg->var_off); struct tnum var32_off = tnum_subreg(dst_reg->var_off);
s32 smin_val = src_reg->s32_min_value; s32 smin_val = src_reg->s32_min_value;
/* Assuming scalar64_min_max_xor will be called so it is safe if (src_known && dst_known) {
* to skip updating register for known case. __mark_reg32_known(dst_reg, var32_off.value);
*/
if (src_known && dst_known)
return; return;
}
/* We get both minimum and maximum from the var32_off. */ /* We get both minimum and maximum from the var32_off. */
dst_reg->u32_min_value = var32_off.value; dst_reg->u32_min_value = var32_off.value;
@ -13200,6 +13196,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return 0; return 0;
} }
/*
 * Set of BTF IDs of kernel functions that are refused as attach targets:
 * check_attach_btf_id() returns -EINVAL when a BPF_PROG_TYPE_TRACING
 * program targets a function whose BTF ID is contained in this set.
 * Entries are only listed under the config conditions shown below.
 */
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
#endif
BTF_SET_END(btf_id_deny)
static int check_attach_btf_id(struct bpf_verifier_env *env) static int check_attach_btf_id(struct bpf_verifier_env *env)
{ {
struct bpf_prog *prog = env->prog; struct bpf_prog *prog = env->prog;
@ -13259,6 +13266,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
ret = bpf_lsm_verify_prog(&env->log, prog); ret = bpf_lsm_verify_prog(&env->log, prog);
if (ret < 0) if (ret < 0)
return ret; return ret;
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
btf_id_set_contains(&btf_id_deny, btf_id)) {
return -EINVAL;
} }
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);

View File

@ -225,7 +225,27 @@ static int bpf_stats_handler(struct ctl_table *table, int write,
mutex_unlock(&bpf_stats_enabled_mutex); mutex_unlock(&bpf_stats_enabled_mutex);
return ret; return ret;
} }
#endif
/*
 * proc handler for a sysctl integer whose value 1 is a locked state.
 *
 * The value is parsed by proc_dointvec_minmax() into a local copy (via a
 * temporary copy of @table whose data pointer is redirected), so that the
 * new value can be vetted before it is committed to table->data.
 *
 * Writes require CAP_SYS_ADMIN (-EPERM otherwise). If the stored value was
 * 1 when the handler was entered, a successful parse of any value other
 * than 1 is rejected with -EPERM and the stored value is left untouched.
 *
 * Returns the result of proc_dointvec_minmax(), or -EPERM as described.
 */
static int bpf_unpriv_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
/* snapshot of the current setting; also the scratch target for parsing */
int ret, unpriv_enable = *(int *)table->data;
bool locked_state = unpriv_enable == 1;
struct ctl_table tmp = *table;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
tmp.data = &unpriv_enable;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && !ret) {
/* once locked at 1, the value may not be changed any more */
if (locked_state && unpriv_enable != 1)
return -EPERM;
*(int *)table->data = unpriv_enable;
}
return ret;
}
#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
/* /*
* /proc/sys support * /proc/sys support
@ -2600,10 +2620,9 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_unprivileged_bpf_disabled, .data = &sysctl_unprivileged_bpf_disabled,
.maxlen = sizeof(sysctl_unprivileged_bpf_disabled), .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
.mode = 0644, .mode = 0644,
/* only handle a transition from default "0" to "1" */ .proc_handler = bpf_unpriv_handler,
.proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO,
.extra1 = SYSCTL_ONE, .extra2 = &two,
.extra2 = SYSCTL_ONE,
}, },
{ {
.procname = "bpf_stats_enabled", .procname = "bpf_stats_enabled",

View File

@ -302,21 +302,6 @@ config BQL
select DQL select DQL
default y default y
config BPF_JIT
bool "enable BPF Just In Time compiler"
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
depends on MODULES
help
Berkeley Packet Filter filtering capabilities are normally handled
by an interpreter. This option allows kernel to generate a native
code when filter is loaded in memory. This should speedup
packet sniffing (libpcap/tcpdump).
Note, admin should enable this feature changing:
/proc/sys/net/core/bpf_jit_enable
/proc/sys/net/core/bpf_jit_harden (optional)
/proc/sys/net/core/bpf_jit_kallsyms (optional)
config BPF_STREAM_PARSER config BPF_STREAM_PARSER
bool "enable BPF STREAM_PARSER" bool "enable BPF STREAM_PARSER"
depends on INET depends on INET
@ -470,15 +455,3 @@ config ETHTOOL_NETLINK
e.g. notification messages. e.g. notification messages.
endif # if NET endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
# Only one of the two can be selected for a specific arch since eBPF JIT supersedes
# the cBPF JIT.
# Classic BPF JIT (cBPF)
config HAVE_CBPF_JIT
bool
# Extended BPF JIT (eBPF)
config HAVE_EBPF_JIT
bool

View File

@ -185,6 +185,7 @@ BTF_ID(func, tcp_reno_cong_avoid)
BTF_ID(func, tcp_reno_undo_cwnd) BTF_ID(func, tcp_reno_undo_cwnd)
BTF_ID(func, tcp_slow_start) BTF_ID(func, tcp_slow_start)
BTF_ID(func, tcp_cong_avoid_ai) BTF_ID(func, tcp_cong_avoid_ai)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE #ifdef CONFIG_DYNAMIC_FTRACE
#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC) #if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC)
BTF_ID(func, cubictcp_init) BTF_ID(func, cubictcp_init)
@ -213,6 +214,7 @@ BTF_ID(func, bbr_min_tso_segs)
BTF_ID(func, bbr_set_state) BTF_ID(func, bbr_set_state)
#endif #endif
#endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_X86 */
BTF_SET_END(bpf_tcp_ca_kfunc_ids) BTF_SET_END(bpf_tcp_ca_kfunc_ids)
static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id) static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id)

View File

@ -1255,7 +1255,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size)
for (i = 0; i < batch_size; i++) { for (i = 0; i < batch_size; i++) {
struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx,
idx + i); idx + i);
tx_desc->addr = (*frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT; tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size;
tx_desc->len = PKT_SIZE; tx_desc->len = PKT_SIZE;
} }

View File

@ -3216,6 +3216,9 @@ static int add_dummy_ksym_var(struct btf *btf)
const struct btf_var_secinfo *vs; const struct btf_var_secinfo *vs;
const struct btf_type *sec; const struct btf_type *sec;
if (!btf)
return 0;
sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC,
BTF_KIND_DATASEC); BTF_KIND_DATASEC);
if (sec_btf_id < 0) if (sec_btf_id < 0)

View File

@ -41,6 +41,11 @@
#define ELF_C_READ_MMAP ELF_C_READ #define ELF_C_READ_MMAP ELF_C_READ
#endif #endif
/* Older libelf all end up in this expression, for both 32 and 64 bit */
#ifndef GELF_ST_VISIBILITY
#define GELF_ST_VISIBILITY(o) ((o) & 0x03)
#endif
#define BTF_INFO_ENC(kind, kind_flag, vlen) \ #define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) #define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type)

View File

@ -40,7 +40,7 @@ struct ipv6_packet pkt_v6 = {
.tcp.doff = 5, .tcp.doff = 5,
}; };
static int settimeo(int fd, int timeout_ms) int settimeo(int fd, int timeout_ms)
{ {
struct timeval timeout = { .tv_sec = 3 }; struct timeval timeout = { .tv_sec = 3 };

View File

@ -33,6 +33,7 @@ struct ipv6_packet {
} __packed; } __packed;
extern struct ipv6_packet pkt_v6; extern struct ipv6_packet pkt_v6;
int settimeo(int fd, int timeout_ms);
int start_server(int family, int type, const char *addr, __u16 port, int start_server(int family, int type, const char *addr, __u16 port,
int timeout_ms); int timeout_ms);
int connect_to_fd(int server_fd, int timeout_ms); int connect_to_fd(int server_fd, int timeout_ms);

View File

@ -0,0 +1,589 @@
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
* This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
* between src and dst. The netns fwd has veth links to each src and dst. The
* client is in src and server in dst. The test installs a TC BPF program to each
* host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
* neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
* switch from ingress side; it also installs a checker prog on the egress side
* to drop unexpected traffic.
*/
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/limits.h>
#include <linux/sysctl.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "test_progs.h"
#include "network_helpers.h"
#include "test_tc_neigh_fib.skel.h"
#include "test_tc_neigh.skel.h"
#include "test_tc_peer.skel.h"
#define NS_SRC "ns_src"
#define NS_FWD "ns_fwd"
#define NS_DST "ns_dst"
#define IP4_SRC "172.16.1.100"
#define IP4_DST "172.16.2.100"
#define IP4_PORT 9004
#define IP6_SRC "::1:dead:beef:cafe"
#define IP6_DST "::2:dead:beef:cafe"
#define IP6_PORT 9006
#define IP4_SLL "169.254.0.1"
#define IP4_DLL "169.254.0.2"
#define IP4_NET "169.254.0.0"
#define IFADDR_STR_LEN 18
#define PING_ARGS "-c 3 -w 10 -q"
#define SRC_PROG_PIN_FILE "/sys/fs/bpf/test_tc_src"
#define DST_PROG_PIN_FILE "/sys/fs/bpf/test_tc_dst"
#define CHK_PROG_PIN_FILE "/sys/fs/bpf/test_tc_chk"
#define TIMEOUT_MILLIS 10000
#define MAX_PROC_MODS 128
#define MAX_PROC_VALUE_LEN 16
/*
 * Print a printf-style message to stderr, prefixed with the source file,
 * line number and strerror(errno), and terminated with a newline.
 * MSG must be a string literal since it is concatenated into the format.
 */
#define log_err(MSG, ...) \
fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
__FILE__, __LINE__, strerror(errno), ##__VA_ARGS__)
/* Record of one /proc entry changed by modify_proc(). */
struct proc_mod {
/* path of the modified file */
char path[PATH_MAX];
/* raw bytes read from the file before it was overwritten */
char oldval[MAX_PROC_VALUE_LEN];
/* number of bytes of oldval that restore_proc() writes back */
int oldlen;
};
/* NULL-terminated list of the network namespaces used by the test */
static const char * const namespaces[] = {NS_SRC, NS_FWD, NS_DST, NULL};
/* fd that setns_root() passes to setns() to get back to the root netns */
static int root_netns_fd = -1;
/* number of valid entries in proc_mods[] */
static int num_proc_mods;
static struct proc_mod proc_mods[MAX_PROC_MODS];
/**
 * modify_proc() - Modify entry in /proc
 * @path: file to modify
 * @newval: NUL-terminated string to write to the file
 *
 * Modifies an entry in /proc and saves the original value for later
 * restoration with restore_proc(). The modification is only recorded in
 * proc_mods[] once the old value has been read and the new value has been
 * written out successfully.
 *
 * Return: 0 on success, -1 on failure (table full, open/read/write error).
 */
static int modify_proc(const char *path, const char *newval)
{
	struct proc_mod *mod;
	size_t oldlen;
	FILE *f;

	if (num_proc_mods + 1 > MAX_PROC_MODS)
		return -1;

	f = fopen(path, "r+");
	if (!f)
		return -1;

	mod = &proc_mods[num_proc_mods];

	/* strncpy() does not terminate the copy on truncation, so do it here */
	strncpy(mod->path, path, PATH_MAX - 1);
	mod->path[PATH_MAX - 1] = '\0';

	oldlen = fread(mod->oldval, 1, MAX_PROC_VALUE_LEN, f);
	if (!oldlen) {
		log_err("reading from %s failed", path);
		goto fail;
	}
	/* restore_proc() writes back exactly this many bytes of oldval */
	mod->oldlen = oldlen;

	rewind(f);
	if (fwrite(newval, strlen(newval), 1, f) != 1) {
		log_err("writing to %s failed", path);
		goto fail;
	}

	/* stdio buffers the write; a rejected value only shows up on flush */
	if (fclose(f)) {
		log_err("writing to %s failed", path);
		return -1;
	}

	num_proc_mods++;
	return 0;

fail:
	fclose(f);
	return -1;
}
/**
* restore_proc() - Restore all /proc modifications
*/
static void restore_proc(void)
{
int i;
for (i = 0; i < num_proc_mods; i++) {
struct proc_mod *mod = &proc_mods[i];
FILE *f;
f = fopen(mod->path, "w");
if (!f) {
log_err("fopen of %s failed", mod->path);
continue;
}
if (fwrite(mod->oldval, mod->oldlen, 1, f) != 1)
log_err("fwrite to %s failed", mod->path);
fclose(f);
}
num_proc_mods = 0;
}
/**
 * setns_by_name() - Switch to the named network namespace
 * @name: namespace name, looked up under /var/run/netns
 *
 * Return: result of setns() on success of open(), otherwise the negative
 * value returned by open().
 */
static int setns_by_name(const char *name)
{
	char ns_file[PATH_MAX];
	int fd, ret;

	snprintf(ns_file, sizeof(ns_file), "%s/%s", "/var/run/netns", name);

	fd = open(ns_file, O_RDONLY | O_CLOEXEC);
	if (fd >= 0) {
		ret = setns(fd, CLONE_NEWNET);
		/* the fd is only needed for the setns() call itself */
		close(fd);
	} else {
		ret = fd;
	}

	return ret;
}
/**
 * setns_root() - Return to the original (root) network namespace
 *
 * A failure here is not expected, so no error is returned to the caller;
 * it is reported through the assertion, which marks the test as failed.
 */
static void setns_root(void)
{
	int err = setns(root_netns_fd, CLONE_NEWNET);

	ASSERT_OK(err, "setns root");
}
/*
 * Run "ip netns <verb> <ns>" for every entry of namespaces[].
 * Stops at the first command that fails.
 *
 * Returns 0 if all commands succeeded, -1 otherwise.
 */
static int netns_setup_namespaces(const char *verb)
{
	char cmd[128];
	int i;

	for (i = 0; namespaces[i]; i++) {
		snprintf(cmd, sizeof(cmd), "ip netns %s %s", verb, namespaces[i]);
		if (!ASSERT_OK(system(cmd), cmd))
			return -1;
	}

	return 0;
}
/* Output of netns_setup_links_and_routes(). */
struct netns_setup_result {
/* ifindex of veth_src_fwd as read from sysfs */
int ifindex_veth_src_fwd;
/* ifindex of veth_dst_fwd as read from sysfs */
int ifindex_veth_dst_fwd;
};
/*
 * Read exactly IFADDR_STR_LEN bytes of /sys/class/net/<name>/address into
 * @ifaddr. No terminator is written, so the caller has to provide one.
 *
 * Returns 0 on success, -1 if the file cannot be opened or is too short.
 */
static int get_ifaddr(const char *name, char *ifaddr)
{
	char sysfs_path[PATH_MAX];
	FILE *fp;
	int nread;
	bool ok;

	snprintf(sysfs_path, PATH_MAX, "/sys/class/net/%s/address", name);

	fp = fopen(sysfs_path, "r");
	if (!ASSERT_OK_PTR(fp, sysfs_path))
		return -1;

	nread = fread(ifaddr, 1, IFADDR_STR_LEN, fp);
	ok = ASSERT_EQ(nread, IFADDR_STR_LEN, "fread ifaddr");
	fclose(fp);

	return ok ? 0 : -1;
}
/*
 * Read the interface index of @name from /sys/class/net/<name>/ifindex.
 *
 * Returns the parsed index, or -1 if the file cannot be opened or read.
 */
static int get_ifindex(const char *name)
{
	char path[PATH_MAX];
	char buf[32] = {};
	FILE *f;
	int ret;

	snprintf(path, PATH_MAX, "/sys/class/net/%s/ifindex", name);
	f = fopen(path, "r");
	if (!ASSERT_OK_PTR(f, path))
		return -1;

	/*
	 * Read at most sizeof(buf) - 1 bytes into the zeroed buffer so that
	 * atoi() below always sees a NUL-terminated string.
	 */
	ret = fread(buf, 1, sizeof(buf) - 1, f);
	if (!ASSERT_GT(ret, 0, "fread ifindex")) {
		fclose(f);
		return -1;
	}
	fclose(f);

	return atoi(buf);
}
/*
 * Format a shell command (printf-style, at most 1024 bytes including the
 * terminator), run it with system() and assert that it returned 0.
 *
 * On failure this jumps to a label named "fail", which every function
 * using the macro therefore has to define.
 */
#define SYS(fmt, ...) \
({ \
char cmd[1024]; \
snprintf(cmd, sizeof(cmd), fmt, ##__VA_ARGS__); \
if (!ASSERT_OK(system(cmd), cmd)) \
goto fail; \
})
/*
 * Create the two veth pairs, move each end into its namespace and configure
 * addresses, routes and static neighbour entries in the src, fwd and dst
 * namespaces.
 *
 * The ifindexes of veth_src_fwd and veth_dst_fwd are stored in @result.
 * setns_root() is called before returning on both the success and the
 * failure path.
 *
 * Returns 0 on success, -1 as soon as any step fails.
 */
static int netns_setup_links_and_routes(struct netns_setup_result *result)
{
/* one extra zeroed byte so the addresses can be used as "%s" arguments */
char veth_src_fwd_addr[IFADDR_STR_LEN+1] = {};
char veth_dst_fwd_addr[IFADDR_STR_LEN+1] = {};
SYS("ip link add veth_src type veth peer name veth_src_fwd");
SYS("ip link add veth_dst type veth peer name veth_dst_fwd");
/* addresses and ifindexes are read before the devices are moved below */
if (get_ifaddr("veth_src_fwd", veth_src_fwd_addr))
goto fail;
if (get_ifaddr("veth_dst_fwd", veth_dst_fwd_addr))
goto fail;
result->ifindex_veth_src_fwd = get_ifindex("veth_src_fwd");
if (result->ifindex_veth_src_fwd < 0)
goto fail;
result->ifindex_veth_dst_fwd = get_ifindex("veth_dst_fwd");
if (result->ifindex_veth_dst_fwd < 0)
goto fail;
SYS("ip link set veth_src netns " NS_SRC);
SYS("ip link set veth_src_fwd netns " NS_FWD);
SYS("ip link set veth_dst_fwd netns " NS_FWD);
SYS("ip link set veth_dst netns " NS_DST);
/** setup in 'src' namespace */
if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src"))
goto fail;
SYS("ip addr add " IP4_SRC "/32 dev veth_src");
SYS("ip addr add " IP6_SRC "/128 dev veth_src nodad");
SYS("ip link set dev veth_src up");
SYS("ip route add " IP4_DST "/32 dev veth_src scope global");
SYS("ip route add " IP4_NET "/16 dev veth_src scope global");
SYS("ip route add " IP6_DST "/128 dev veth_src scope global");
/* static neighbour entries: dst addresses map to the address of veth_src_fwd */
SYS("ip neigh add " IP4_DST " dev veth_src lladdr %s",
veth_src_fwd_addr);
SYS("ip neigh add " IP6_DST " dev veth_src lladdr %s",
veth_src_fwd_addr);
/** setup in 'fwd' namespace */
if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
goto fail;
/* The fwd netns automatically gets a v6 LL address / routes, but also
* needs v4 one in order to start ARP probing. IP4_NET route is added
* to the endpoints so that the ARP processing will reply.
*/
SYS("ip addr add " IP4_SLL "/32 dev veth_src_fwd");
SYS("ip addr add " IP4_DLL "/32 dev veth_dst_fwd");
SYS("ip link set dev veth_src_fwd up");
SYS("ip link set dev veth_dst_fwd up");
SYS("ip route add " IP4_SRC "/32 dev veth_src_fwd scope global");
SYS("ip route add " IP6_SRC "/128 dev veth_src_fwd scope global");
SYS("ip route add " IP4_DST "/32 dev veth_dst_fwd scope global");
SYS("ip route add " IP6_DST "/128 dev veth_dst_fwd scope global");
/** setup in 'dst' namespace */
if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst"))
goto fail;
SYS("ip addr add " IP4_DST "/32 dev veth_dst");
SYS("ip addr add " IP6_DST "/128 dev veth_dst nodad");
SYS("ip link set dev veth_dst up");
SYS("ip route add " IP4_SRC "/32 dev veth_dst scope global");
SYS("ip route add " IP4_NET "/16 dev veth_dst scope global");
SYS("ip route add " IP6_SRC "/128 dev veth_dst scope global");
/* static neighbour entries: src addresses map to the address of veth_dst_fwd */
SYS("ip neigh add " IP4_SRC " dev veth_dst lladdr %s",
veth_dst_fwd_addr);
SYS("ip neigh add " IP6_SRC " dev veth_dst lladdr %s",
veth_dst_fwd_addr);
setns_root();
return 0;
fail:
/* SYS() and the explicit checks above jump here on any failure */
setns_root();
return -1;
}
/*
 * Attach the pinned TC programs inside the fwd namespace: a clsact qdisc is
 * added to veth_src_fwd and veth_dst_fwd, the src/dst programs are attached
 * on ingress and the checker program on egress of both devices.
 *
 * Switches back with setns_root() before returning once NS_FWD was entered.
 *
 * Returns 0 on success, -1 if entering the namespace or any tc command
 * fails. Callers treat any non-zero return as failure and skip the
 * connectivity checks, so the success path must return 0.
 */
static int netns_load_bpf(void)
{
	if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
		return -1;

	SYS("tc qdisc add dev veth_src_fwd clsact");
	SYS("tc filter add dev veth_src_fwd ingress bpf da object-pinned "
	    SRC_PROG_PIN_FILE);
	SYS("tc filter add dev veth_src_fwd egress bpf da object-pinned "
	    CHK_PROG_PIN_FILE);

	SYS("tc qdisc add dev veth_dst_fwd clsact");
	SYS("tc filter add dev veth_dst_fwd ingress bpf da object-pinned "
	    DST_PROG_PIN_FILE);
	SYS("tc filter add dev veth_dst_fwd egress bpf da object-pinned "
	    CHK_PROG_PIN_FILE);

	setns_root();
	return 0;

fail:
	setns_root();
	return -1;
}
/*
 * Remove the clsact qdiscs from veth_src_fwd and veth_dst_fwd inside the
 * fwd namespace. setns_root() is called on every path before returning.
 *
 * Returns 0 on success, -1 if entering the namespace or a tc command fails.
 */
static int netns_unload_bpf(void)
{
	int ret = -1;

	if (ASSERT_OK(setns_by_name(NS_FWD), "setns fwd")) {
		SYS("tc qdisc delete dev veth_src_fwd clsact");
		SYS("tc qdisc delete dev veth_dst_fwd clsact");
		ret = 0;
	}

fail:
	/* reached both by falling through and from a failing SYS() */
	setns_root();
	return ret;
}
/*
 * Check TCP connectivity: start a listener on @addr:@port in the dst
 * namespace, connect to it from the src namespace, then send a short
 * payload from the client and read it on the accepted socket.
 *
 * All results are reported through the ASSERT_* macros. The function
 * switches back with setns_root() and closes any opened sockets before
 * returning once NS_DST was entered.
 */
static void test_tcp(int family, const char *addr, __u16 port)
{
	int srv_fd = -1, conn_fd = -1, cli_fd = -1;
	char payload[] = "testing testing";
	int nbytes;

	if (!ASSERT_OK(setns_by_name(NS_DST), "setns dst"))
		return;

	srv_fd = start_server(family, SOCK_STREAM, addr, port, 0);
	if (!ASSERT_GE(srv_fd, 0, "listen"))
		goto out;

	if (!ASSERT_OK(setns_by_name(NS_SRC), "setns src"))
		goto out;

	cli_fd = connect_to_fd(srv_fd, TIMEOUT_MILLIS);
	if (!ASSERT_GE(cli_fd, 0, "connect_to_fd"))
		goto out;

	conn_fd = accept(srv_fd, NULL, NULL);
	if (!ASSERT_GE(conn_fd, 0, "accept"))
		goto out;

	if (!ASSERT_OK(settimeo(conn_fd, TIMEOUT_MILLIS), "settimeo"))
		goto out;

	nbytes = write(cli_fd, payload, sizeof(payload));
	if (!ASSERT_EQ(nbytes, sizeof(payload), "send to server"))
		goto out;

	nbytes = read(conn_fd, payload, sizeof(payload));
	ASSERT_EQ(nbytes, sizeof(payload), "recv from server");

out:
	setns_root();
	if (srv_fd >= 0)
		close(srv_fd);
	if (conn_fd >= 0)
		close(conn_fd);
	if (cli_fd >= 0)
		close(cli_fd);
}
/*
 * Ping @addr from inside the src namespace, using "ping6" for AF_INET6 and
 * "ping" for every other family.
 *
 * Returns 0 if the command succeeded, -1 otherwise.
 */
static int test_ping(int family, const char *addr)
{
	const char *tool;

	if (family == AF_INET6)
		tool = "ping6";
	else
		tool = "ping";

	SYS("ip netns exec " NS_SRC " %s " PING_ARGS " %s", tool, addr);
	return 0;

fail:
	return -1;
}
static void test_connectivity(void)
{
test_tcp(AF_INET, IP4_DST, IP4_PORT);
test_ping(AF_INET, IP4_DST);
test_tcp(AF_INET6, IP6_DST, IP6_PORT);
test_ping(AF_INET6, IP6_DST);
}
static void test_tc_redirect_neigh_fib(struct netns_setup_result *setup_result)
{
struct test_tc_neigh_fib *skel;
int err;
skel = test_tc_neigh_fib__open();
if (!ASSERT_OK_PTR(skel, "test_tc_neigh_fib__open"))
return;
if (!ASSERT_OK(test_tc_neigh_fib__load(skel), "test_tc_neigh_fib__load")) {
test_tc_neigh_fib__destroy(skel);
return;
}
err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
goto done;
err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
goto done;
err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
goto done;
if (netns_load_bpf())
goto done;
/* bpf_fib_lookup() checks if forwarding is enabled */
if (!ASSERT_OK(setns_by_name(NS_FWD), "setns fwd"))
goto done;
err = modify_proc("/proc/sys/net/ipv4/ip_forward", "1");
if (!ASSERT_OK(err, "set ipv4.ip_forward"))
goto done;
err = modify_proc("/proc/sys/net/ipv6/conf/all/forwarding", "1");
if (!ASSERT_OK(err, "set ipv6.forwarding"))
goto done;
setns_root();
test_connectivity();
done:
bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
test_tc_neigh_fib__destroy(skel);
netns_unload_bpf();
setns_root();
restore_proc();
}
/* Subtest driving the test_tc_neigh skeleton.
 *
 * Opens the skeleton, fills its IFINDEX_SRC/IFINDEX_DST rodata from
 * setup_result, loads it, pins the tc_src/tc_chk/tc_dst programs at the
 * *_PROG_PIN_FILE paths, calls netns_load_bpf() (defined outside this view;
 * presumably attaches the pinned programs) and runs test_connectivity().
 */
static void test_tc_redirect_neigh(struct netns_setup_result *setup_result)
{
struct test_tc_neigh *skel;
int err;
skel = test_tc_neigh__open();
if (!ASSERT_OK_PTR(skel, "test_tc_neigh__open"))
return;
/* The ifindex constants are written between open and load. */
skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd;
skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd;
err = test_tc_neigh__load(skel);
/* Nothing has been pinned yet, so a load failure only needs the skeleton
 * destroyed.
 */
if (!ASSERT_OK(err, "test_tc_neigh__load")) {
test_tc_neigh__destroy(skel);
return;
}
err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
goto done;
err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
goto done;
err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
goto done;
if (netns_load_bpf())
goto done;
test_connectivity();
done:
/* Common cleanup for every path after a successful load. All three unpins
 * are issued even if the corresponding pin never happened; their return
 * values are ignored.
 */
bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
test_tc_neigh__destroy(skel);
netns_unload_bpf();
setns_root();
}
/* Subtest driving the test_tc_peer skeleton.
 *
 * Opens the skeleton, fills its IFINDEX_SRC/IFINDEX_DST rodata from
 * setup_result, loads it, pins the tc_src/tc_chk/tc_dst programs at the
 * *_PROG_PIN_FILE paths, calls netns_load_bpf() (defined outside this view;
 * presumably attaches the pinned programs) and runs test_connectivity().
 */
static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
{
struct test_tc_peer *skel;
int err;
skel = test_tc_peer__open();
if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
return;
/* The ifindex constants are written between open and load. */
skel->rodata->IFINDEX_SRC = setup_result->ifindex_veth_src_fwd;
skel->rodata->IFINDEX_DST = setup_result->ifindex_veth_dst_fwd;
err = test_tc_peer__load(skel);
/* Nothing has been pinned yet, so a load failure only needs the skeleton
 * destroyed.
 */
if (!ASSERT_OK(err, "test_tc_peer__load")) {
test_tc_peer__destroy(skel);
return;
}
err = bpf_program__pin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " SRC_PROG_PIN_FILE))
goto done;
err = bpf_program__pin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " CHK_PROG_PIN_FILE))
goto done;
err = bpf_program__pin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
if (!ASSERT_OK(err, "pin " DST_PROG_PIN_FILE))
goto done;
if (netns_load_bpf())
goto done;
test_connectivity();
done:
/* Common cleanup for every path after a successful load. All three unpins
 * are issued even if the corresponding pin never happened; their return
 * values are ignored.
 */
bpf_program__unpin(skel->progs.tc_src, SRC_PROG_PIN_FILE);
bpf_program__unpin(skel->progs.tc_chk, CHK_PROG_PIN_FILE);
bpf_program__unpin(skel->progs.tc_dst, DST_PROG_PIN_FILE);
test_tc_peer__destroy(skel);
netns_unload_bpf();
setns_root();
}
/* Entry point of the tc_redirect test.
 *
 * Opens /proc/self/ns/net into root_netns_fd (declared outside this
 * function), creates the namespaces and the links/routes between them, then
 * runs each of the three subtests that test__start_subtest() selects. The
 * same setup_result is handed to every subtest.
 */
void test_tc_redirect(void)
{
struct netns_setup_result setup_result;
root_netns_fd = open("/proc/self/ns/net", O_RDONLY);
if (!ASSERT_GE(root_netns_fd, 0, "open /proc/self/ns/net"))
return;
if (netns_setup_namespaces("add"))
goto done;
if (netns_setup_links_and_routes(&setup_result))
goto done;
if (test__start_subtest("tc_redirect_peer"))
test_tc_redirect_peer(&setup_result);
if (test__start_subtest("tc_redirect_neigh"))
test_tc_redirect_neigh(&setup_result);
if (test__start_subtest("tc_redirect_neigh_fib"))
test_tc_redirect_neigh_fib(&setup_result);
done:
/* Reached on success and on setup failure alike: the "delete" call is made
 * even when the "add" step above reported an error.
 */
close(root_netns_fd);
netns_setup_namespaces("delete");
}

View File

@ -33,17 +33,8 @@
a.s6_addr32[3] == b.s6_addr32[3]) a.s6_addr32[3] == b.s6_addr32[3])
#endif #endif
enum { static volatile const __u32 IFINDEX_SRC;
dev_src, static volatile const __u32 IFINDEX_DST;
dev_dst,
};
struct bpf_map_def SEC("maps") ifindex_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 2,
};
static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb, static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
__be32 addr) __be32 addr)
@ -79,14 +70,8 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb,
return v6_equal(ip6h->daddr, addr); return v6_equal(ip6h->daddr, addr);
} }
static __always_inline int get_dev_ifindex(int which) SEC("classifier/chk_egress")
{ int tc_chk(struct __sk_buff *skb)
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
return ifindex ? *ifindex : 0;
}
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
{ {
void *data_end = ctx_ptr(skb->data_end); void *data_end = ctx_ptr(skb->data_end);
void *data = ctx_ptr(skb->data); void *data = ctx_ptr(skb->data);
@ -98,7 +83,8 @@ SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
return !raw[0] && !raw[1] && !raw[2] ? TC_ACT_SHOT : TC_ACT_OK; return !raw[0] && !raw[1] && !raw[2] ? TC_ACT_SHOT : TC_ACT_OK;
} }
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) SEC("classifier/dst_ingress")
int tc_dst(struct __sk_buff *skb)
{ {
__u8 zero[ETH_ALEN * 2]; __u8 zero[ETH_ALEN * 2];
bool redirect = false; bool redirect = false;
@ -119,10 +105,11 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
return TC_ACT_SHOT; return TC_ACT_SHOT;
return bpf_redirect_neigh(get_dev_ifindex(dev_src), NULL, 0, 0); return bpf_redirect_neigh(IFINDEX_SRC, NULL, 0, 0);
} }
SEC("src_ingress") int tc_src(struct __sk_buff *skb) SEC("classifier/src_ingress")
int tc_src(struct __sk_buff *skb)
{ {
__u8 zero[ETH_ALEN * 2]; __u8 zero[ETH_ALEN * 2];
bool redirect = false; bool redirect = false;
@ -143,7 +130,7 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb)
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0) if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
return TC_ACT_SHOT; return TC_ACT_SHOT;
return bpf_redirect_neigh(get_dev_ifindex(dev_dst), NULL, 0, 0); return bpf_redirect_neigh(IFINDEX_DST, NULL, 0, 0);
} }
char __license[] SEC("license") = "GPL"; char __license[] SEC("license") = "GPL";

View File

@ -75,7 +75,8 @@ static __always_inline int fill_fib_params_v6(struct __sk_buff *skb,
return 0; return 0;
} }
SEC("chk_egress") int tc_chk(struct __sk_buff *skb) SEC("classifier/chk_egress")
int tc_chk(struct __sk_buff *skb)
{ {
void *data_end = ctx_ptr(skb->data_end); void *data_end = ctx_ptr(skb->data_end);
void *data = ctx_ptr(skb->data); void *data = ctx_ptr(skb->data);
@ -142,12 +143,14 @@ static __always_inline int tc_redir(struct __sk_buff *skb)
/* these are identical, but keep them separate for compatibility with the /* these are identical, but keep them separate for compatibility with the
* section names expected by test_tc_redirect.sh * section names expected by test_tc_redirect.sh
*/ */
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) SEC("classifier/dst_ingress")
int tc_dst(struct __sk_buff *skb)
{ {
return tc_redir(skb); return tc_redir(skb);
} }
SEC("src_ingress") int tc_src(struct __sk_buff *skb) SEC("classifier/src_ingress")
int tc_src(struct __sk_buff *skb)
{ {
return tc_redir(skb); return tc_redir(skb);
} }

View File

@ -8,38 +8,25 @@
#include <bpf/bpf_helpers.h> #include <bpf/bpf_helpers.h>
enum { static volatile const __u32 IFINDEX_SRC;
dev_src, static volatile const __u32 IFINDEX_DST;
dev_dst,
};
struct bpf_map_def SEC("maps") ifindex_map = { SEC("classifier/chk_egress")
.type = BPF_MAP_TYPE_ARRAY, int tc_chk(struct __sk_buff *skb)
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 2,
};
static __always_inline int get_dev_ifindex(int which)
{
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
return ifindex ? *ifindex : 0;
}
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
{ {
return TC_ACT_SHOT; return TC_ACT_SHOT;
} }
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb) SEC("classifier/dst_ingress")
int tc_dst(struct __sk_buff *skb)
{ {
return bpf_redirect_peer(get_dev_ifindex(dev_src), 0); return bpf_redirect_peer(IFINDEX_SRC, 0);
} }
SEC("src_ingress") int tc_src(struct __sk_buff *skb) SEC("classifier/src_ingress")
int tc_src(struct __sk_buff *skb)
{ {
return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0); return bpf_redirect_peer(IFINDEX_DST, 0);
} }
char __license[] SEC("license") = "GPL"; char __license[] SEC("license") = "GPL";

View File

@ -1,216 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
# between src and dst. The fwd netns has a veth link to each of src and dst. The
# client is in src and the server in dst. The test installs a TC BPF program on
# each host-facing veth in fwd which calls into i) bpf_redirect_neigh() to
# perform the neigh addr population and redirect or ii) bpf_redirect_peer() for
# namespace switch from ingress side; it also installs a checker prog on the
# egress side to drop unexpected traffic.
# Refuse to run unless the effective UID is 0.
if (( EUID != 0 )); then
  printf '%s\n' "This script must be run as root" "FAIL"
  exit 1
fi

# Check that the needed tools are present; stop at the first missing one.
for tool in nc dd timeout ping perl jq bpftool; do
  if ! command -v "${tool}" >/dev/null 2>&1; then
    echo >&2 "${tool} is not available"
    exit 1
  fi
done

# Use a dedicated ping6 binary when there is one, plain ping otherwise.
PING6=ping
if command -v ping6 >/dev/null 2>&1; then
  PING6=ping6
fi

# Colour escapes used for the PASS/FAIL output.
readonly GREEN='\033[0;92m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color

readonly PING_ARG="-c 3 -w 10 -q"
readonly TIMEOUT=10

# Namespace names get a random suffix from mktemp.
readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"

# Endpoint addresses assigned to veth_src / veth_dst by netns_setup.
readonly IP4_SRC="172.16.1.100"
readonly IP4_DST="172.16.2.100"
readonly IP6_SRC="::1:dead:beef:cafe"
readonly IP6_DST="::2:dead:beef:cafe"

# Link-local IPv4 addresses for the two veth ends inside fwd, plus the
# network used for the matching routes on the endpoints.
readonly IP4_SLL="169.254.0.1"
readonly IP4_DLL="169.254.0.2"
readonly IP4_NET="169.254.0.0"
# Delete the three test namespaces.
# Globals:   NS_SRC, NS_FWD, NS_DST (read)
# Returns:   always 0
#
# Cleanup is best-effort: this function also runs from the EXIT trap while
# "set -e" is active, where a single failing deletion (for example a
# namespace that was never created) would otherwise abort the function and
# skip the remaining namespaces. Errors from ip are still printed.
netns_cleanup()
{
  local ns

  for ns in "${NS_SRC}" "${NS_FWD}" "${NS_DST}"; do
    ip netns del "${ns}" || true
  done
}
# Create the src <-> fwd <-> dst topology: three namespaces, two veth pairs
# with one end of each pair in fwd, addresses, routes and static neighbour
# entries on the endpoints.
# Globals:   NS_SRC, NS_FWD, NS_DST, IP4_SRC, IP4_DST, IP6_SRC, IP6_DST,
#            IP4_SLL, IP4_DLL, IP4_NET (read)
# Arguments: none
netns_setup()
{
  # MAC addresses of the two veth ends living in fwd; kept local so they do
  # not leak into the rest of the script.
  local fmac_src fmac_dst

  ip netns add "${NS_SRC}"
  ip netns add "${NS_FWD}"
  ip netns add "${NS_DST}"

  ip link add veth_src type veth peer name veth_src_fwd
  ip link add veth_dst type veth peer name veth_dst_fwd

  ip link set veth_src netns "${NS_SRC}"
  ip link set veth_src_fwd netns "${NS_FWD}"
  ip link set veth_dst netns "${NS_DST}"
  ip link set veth_dst_fwd netns "${NS_FWD}"

  ip -netns "${NS_SRC}" addr add "${IP4_SRC}/32" dev veth_src
  ip -netns "${NS_DST}" addr add "${IP4_DST}/32" dev veth_dst

  # The fwd netns automatically gets a v6 LL address / routes, but also
  # needs a v4 one in order to start ARP probing. The IP4_NET route is added
  # to the endpoints so that the ARP processing will reply.
  ip -netns "${NS_FWD}" addr add "${IP4_SLL}/32" dev veth_src_fwd
  ip -netns "${NS_FWD}" addr add "${IP4_DLL}/32" dev veth_dst_fwd

  ip -netns "${NS_SRC}" addr add "${IP6_SRC}/128" dev veth_src nodad
  ip -netns "${NS_DST}" addr add "${IP6_DST}/128" dev veth_dst nodad

  ip -netns "${NS_SRC}" link set dev veth_src up
  ip -netns "${NS_FWD}" link set dev veth_src_fwd up
  ip -netns "${NS_DST}" link set dev veth_dst up
  ip -netns "${NS_FWD}" link set dev veth_dst_fwd up

  ip -netns "${NS_SRC}" route add "${IP4_DST}/32" dev veth_src scope global
  ip -netns "${NS_SRC}" route add "${IP4_NET}/16" dev veth_src scope global
  ip -netns "${NS_FWD}" route add "${IP4_SRC}/32" dev veth_src_fwd scope global
  ip -netns "${NS_SRC}" route add "${IP6_DST}/128" dev veth_src scope global
  ip -netns "${NS_FWD}" route add "${IP6_SRC}/128" dev veth_src_fwd scope global

  ip -netns "${NS_DST}" route add "${IP4_SRC}/32" dev veth_dst scope global
  ip -netns "${NS_DST}" route add "${IP4_NET}/16" dev veth_dst scope global
  ip -netns "${NS_FWD}" route add "${IP4_DST}/32" dev veth_dst_fwd scope global
  ip -netns "${NS_DST}" route add "${IP6_SRC}/128" dev veth_dst scope global
  ip -netns "${NS_FWD}" route add "${IP6_DST}/128" dev veth_dst_fwd scope global

  # Static neighbour entries: each endpoint resolves the remote endpoint's
  # address to the MAC of the fwd-side veth it is connected to.
  fmac_src=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_src_fwd/address)
  fmac_dst=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_dst_fwd/address)

  ip -netns "${NS_SRC}" neigh add "${IP4_DST}" dev veth_src lladdr "${fmac_src}"
  ip -netns "${NS_DST}" neigh add "${IP4_SRC}" dev veth_dst lladdr "${fmac_dst}"
  ip -netns "${NS_SRC}" neigh add "${IP6_DST}" dev veth_src lladdr "${fmac_src}"
  ip -netns "${NS_DST}" neigh add "${IP6_SRC}" dev veth_dst lladdr "${fmac_dst}"
}
# Print the coloured verdict for the probe named in $TEST.
# Arguments: $1 - exit status of the probe that just ran
# A non-zero status prints FAIL and terminates the script with status 1.
_netns_report_result()
{
  if [ "$1" -eq 0 ]; then
    echo -e "${TEST}: ${GREEN}PASS${NC}"
    return
  fi
  echo -e "${TEST}: ${RED}FAIL${NC}"
  exit 1
}

# Start nc listeners in dst (ports 9004 / 9006), then probe dst from src over
# TCPv4, TCPv6, ICMPv4 and ICMPv6, in that order.
# Globals:   NS_SRC, NS_DST, IP4_DST, IP6_DST, TIMEOUT, PING_ARG, PING6,
#            GREEN, RED, NC (read); TEST (written)
# errexit is switched off for the duration so that each probe's status can
# be inspected, and switched back on before returning.
netns_test_connectivity()
{
  set +e

  ip netns exec "${NS_DST}" bash -c "nc -4 -l -p 9004 &"
  ip netns exec "${NS_DST}" bash -c "nc -6 -l -p 9006 &"

  TEST="TCPv4 connectivity test"
  ip netns exec "${NS_SRC}" bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"
  _netns_report_result $?

  TEST="TCPv6 connectivity test"
  ip netns exec "${NS_SRC}" bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"
  _netns_report_result $?

  # PING_ARG holds several options and is left unquoted on purpose so that
  # it splits into separate words.
  TEST="ICMPv4 connectivity test"
  ip netns exec "${NS_SRC}" ping $PING_ARG "${IP4_DST}"
  _netns_report_result $?

  TEST="ICMPv6 connectivity test"
  ip netns exec "${NS_SRC}" $PING6 $PING_ARG "${IP6_DST}"
  _netns_report_result $?

  set -e
}
hex_mem_str()
{
perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1
}
# Attach the tc programs from a BPF object to both veth devices in fwd and
# either enable forwarding or fill the object's ifindex map.
# Globals:   NS_FWD (read)
# Arguments: $1 - BPF object file
#            $2 - optional; "1" enables the forwarding sysctls and skips the
#                 map update (default 0)
# All working variables are local so nothing leaks into the caller's scope.
netns_setup_bpf()
{
  local obj=$1
  local use_forwarding=${2:-0}
  local side ifindex_src ifindex_dst progs prog map

  # Per device: clsact qdisc, the <side>_ingress program on ingress and the
  # checker program on egress.
  for side in src dst; do
    ip netns exec "${NS_FWD}" tc qdisc add dev "veth_${side}_fwd" clsact
    ip netns exec "${NS_FWD}" tc filter add dev "veth_${side}_fwd" ingress bpf da obj "${obj}" sec "${side}_ingress"
    ip netns exec "${NS_FWD}" tc filter add dev "veth_${side}_fwd" egress bpf da obj "${obj}" sec chk_egress
  done

  if [ "${use_forwarding}" -eq 1 ]; then
    # bpf_fib_lookup() checks if forwarding is enabled
    ip netns exec "${NS_FWD}" sysctl -w net.ipv4.ip_forward=1
    ip netns exec "${NS_FWD}" sysctl -w net.ipv6.conf.veth_dst_fwd.forwarding=1
    ip netns exec "${NS_FWD}" sysctl -w net.ipv6.conf.veth_src_fwd.forwarding=1
    return 0
  fi

  ifindex_src=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_src_fwd/ifindex)
  ifindex_dst=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_dst_fwd/ifindex)

  # For every tc program that reports a map, store the src ifindex under
  # key 0 and the dst ifindex under key 1.
  progs=$(ip netns exec "${NS_FWD}" bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]')
  # ${progs} is deliberately unquoted: one program id per word.
  for prog in ${progs}; do
    map=$(bpftool prog show id "${prog}" --json | jq -r '.map_ids | .? | .[]')
    if [[ -n "${map}" ]]; then
      # The hex_mem_str output is deliberately unquoted: bpftool takes each
      # byte as a separate argument.
      bpftool map update id "${map}" key hex $(hex_mem_str 0) value hex $(hex_mem_str "${ifindex_src}")
      bpftool map update id "${map}" key hex $(hex_mem_str 1) value hex $(hex_mem_str "${ifindex_dst}")
    fi
  done
}
# Whatever happens from here on, remove the namespaces on exit.
trap netns_cleanup EXIT
set -e

# Three rounds on a fresh topology each: bpf_redirect_neigh(), the
# fib-lookup variant (second word "1" turns on forwarding) and
# bpf_redirect_peer(). Namespaces are torn down between rounds; the EXIT trap
# takes care of the last one.
first_round=1
for bpf_args in "test_tc_neigh.o" "test_tc_neigh_fib.o 1" "test_tc_peer.o"; do
  if [[ -z "${first_round}" ]]; then
    netns_cleanup
  fi
  first_round=
  netns_setup
  # bpf_args is deliberately unquoted so that it splits into the object
  # name and the optional forwarding flag.
  netns_setup_bpf ${bpf_args}
  netns_test_connectivity
done