From c078ca3b0c5bf82c2b31906c446d6e2ad8ea0783 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 17 Jan 2017 22:51:26 +0100 Subject: [PATCH 01/21] netfilter: nft_exthdr: Add support for existence check If NFT_EXTHDR_F_PRESENT is set, exthdr will not copy any header field data into *dest, but instead set it to 1 if the header is found and 0 otherwise. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nft_exthdr.c | 22 ++++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7b730cab99bd..53aac8b8ed6b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -704,6 +704,10 @@ enum nft_payload_attributes { }; #define NFTA_PAYLOAD_MAX (__NFTA_PAYLOAD_MAX - 1) +enum nft_exthdr_flags { + NFT_EXTHDR_F_PRESENT = (1 << 0), +}; + /** * enum nft_exthdr_attributes - nf_tables IPv6 extension header expression netlink attributes * @@ -711,6 +715,7 @@ enum nft_payload_attributes { * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8) * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) + * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -718,6 +723,7 @@ enum nft_exthdr_attributes { NFTA_EXTHDR_TYPE, NFTA_EXTHDR_OFFSET, NFTA_EXTHDR_LEN, + NFTA_EXTHDR_FLAGS, __NFTA_EXTHDR_MAX }; #define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 47beb3abcc9d..a89e5ab150db 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -23,6 +23,7 @@ struct nft_exthdr { u8 offset; u8 len; enum nft_registers dreg:8; + u8 flags; }; static void nft_exthdr_eval(const struct nft_expr *expr, @@ -35,8 +36,12 @@ static void nft_exthdr_eval(const struct nft_expr *expr, int err; err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); - if (err < 0) + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = (err >= 0); + return; + } else if (err < 0) { goto err; + } offset += priv->offset; dest[priv->len / NFT_REG32_SIZE] = 0; @@ -52,6 +57,7 @@ static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, [NFTA_EXTHDR_OFFSET] = { .type = NLA_U32 }, [NFTA_EXTHDR_LEN] = { .type = NLA_U32 }, + [NFTA_EXTHDR_FLAGS] = { .type = NLA_U32 }, }; static int nft_exthdr_init(const struct nft_ctx *ctx, @@ -59,7 +65,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 offset, len; + u32 offset, len, flags = 0; int err; if (tb[NFTA_EXTHDR_DREG] == NULL || @@ -76,10 +82,20 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, if (err < 0) return err; + if (tb[NFTA_EXTHDR_FLAGS]) { + err = nft_parse_u32_check(tb[NFTA_EXTHDR_FLAGS], U8_MAX, &flags); + if (err < 0) + return err; + + if (flags & ~NFT_EXTHDR_F_PRESENT) + return -EINVAL; + } + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); + priv->flags = flags; return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, priv->len); @@ -97,6 +113,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_LEN, htonl(priv->len))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags))) + goto nla_put_failure; return 0; nla_put_failure: From 5cb82a38c6b5152b1deaba0c1596ce63222a4710 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jan 2017 18:30:07 +0100 Subject: [PATCH 02/21] netfilter: nf_tables: pass netns to set->ops->remove() This new parameter is required by the new bitmap set type that comes in a follow up patch. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- net/netfilter/nf_tables_api.c | 6 +++--- net/netfilter/nft_set_hash.c | 3 ++- net/netfilter/nft_set_rbtree.c | 3 ++- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 7dfdb517f0be..a721bcb1210c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -298,7 +298,8 @@ struct nft_set_ops { bool (*deactivate_one)(const struct net *net, const struct nft_set *set, void *priv); - void (*remove)(const struct nft_set *set, + void (*remove)(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 57eeae63f597..3643ce345b59 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3752,7 +3752,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return 0; err6: - set->ops->remove(set, &elem); + set->ops->remove(ctx->net, set, &elem); err5: kfree(trans); err4: @@ -4804,7 +4804,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_DELSETELEM, 0); - te->set->ops->remove(te->set, &te->elem); + te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); te->set->ndeact--; break; @@ -4925,7 +4925,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->remove(te->set, &te->elem); + te->set->ops->remove(net, te->set, &te->elem); atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index e36069fb76ae..bb157bd47fe8 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -203,7 +203,8 @@ static void *nft_hash_deactivate(const struct net *net, return he; } -static void nft_hash_remove(const struct nft_set *set, +static void nft_hash_remove(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_hash *priv = nft_set_priv(set); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index f06f55ee516d..9fbd70da1633 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -151,7 +151,8 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, return err; } -static void nft_rbtree_remove(const struct nft_set *set, +static void nft_rbtree_remove(const struct net *net, + const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree *priv = nft_set_priv(set); From baa2d42cff088d9f962697a10e0345b444c8c409 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jan 2017 18:30:08 +0100 Subject: [PATCH 03/21] netfilter: nf_tables: use struct nft_set_iter in set element flush Instead of struct nft_set_dump_args, remove unnecessary wrapper structure. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3643ce345b59..790ffed82930 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3936,15 +3936,13 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, return -EBUSY; if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) { - struct nft_set_dump_args args = { - .iter = { - .genmask = genmask, - .fn = nft_flush_set, - }, + struct nft_set_iter iter = { + .genmask = genmask, + .fn = nft_flush_set, }; - set->ops->walk(&ctx, set, &args.iter); + set->ops->walk(&ctx, set, &iter); - return args.iter.err; + return iter.err; } nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { From 1ba1c41408df8a9d2f8b9b67e4c9e6f59b29d8ee Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jan 2017 18:30:09 +0100 Subject: [PATCH 04/21] netfilter: nf_tables: rename deactivate_one() to flush() Although semantics are similar to deactivate() with no implicit element lookup, this is only called from the set flush path, so better rename this to flush(). Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 ++++---- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nft_set_hash.c | 8 ++++---- net/netfilter/nft_set_rbtree.c | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a721bcb1210c..ab155644d489 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -260,7 +260,7 @@ struct nft_expr; * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation - * @deactivate_one: deactivate element in the next generation + * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elemeennts * @privsize: function to return size of set private data @@ -295,9 +295,9 @@ struct nft_set_ops { void * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); - bool (*deactivate_one)(const struct net *net, - const struct nft_set *set, - void *priv); + bool (*flush)(const struct net *net, + const struct nft_set *set, + void *priv); void (*remove)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 790ffed82930..c09b11eb36fc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3898,7 +3898,7 @@ static int nft_flush_set(const struct nft_ctx *ctx, if (!trans) return -ENOMEM; - if (!set->ops->deactivate_one(ctx->net, set, elem->priv)) { + if (!set->ops->flush(ctx->net, set, elem->priv)) { err = -ENOENT; goto err1; } diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index bb157bd47fe8..2f10ac3b1b10 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -167,8 +167,8 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set, nft_set_elem_clear_busy(&he->ext); } -static bool nft_hash_deactivate_one(const struct net *net, - const struct nft_set *set, void *priv) +static bool nft_hash_flush(const struct net *net, + const struct nft_set *set, void *priv) { struct nft_hash_elem *he = priv; @@ -195,7 +195,7 @@ static void *nft_hash_deactivate(const struct net *net, rcu_read_lock(); he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); if (he != NULL && - !nft_hash_deactivate_one(net, set, he)) + !nft_hash_flush(net, set, he)) he = NULL; rcu_read_unlock(); @@ -398,7 +398,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .insert = nft_hash_insert, .activate = nft_hash_activate, .deactivate = nft_hash_deactivate, - .deactivate_one = nft_hash_deactivate_one, + .flush = nft_hash_flush, .remove = nft_hash_remove, .lookup = nft_hash_lookup, .update = nft_hash_update, diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 9fbd70da1633..81b8a4c2c061 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -172,8 +172,8 @@ static void nft_rbtree_activate(const struct net *net, nft_set_elem_change_active(net, set, &rbe->ext); } -static bool nft_rbtree_deactivate_one(const struct net *net, - const struct nft_set *set, void *priv) +static bool nft_rbtree_flush(const struct net *net, + const struct nft_set *set, void *priv) { struct nft_rbtree_elem *rbe = priv; @@ -214,7 +214,7 @@ static void *nft_rbtree_deactivate(const struct net *net, parent = parent->rb_right; continue; } - nft_rbtree_deactivate_one(net, set, rbe); + nft_rbtree_flush(net, set, rbe); return rbe; } } @@ -305,7 +305,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = { .insert = nft_rbtree_insert, .remove = nft_rbtree_remove, .deactivate = nft_rbtree_deactivate, - .deactivate_one = nft_rbtree_deactivate_one, + .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, From 1f48ff6c5393aa7fe290faf5d633164f105b0aa7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jan 2017 18:30:10 +0100 Subject: [PATCH 05/21] netfilter: nf_tables: add flush field to struct nft_set_iter This provides context to walk callback iterator, thus, we know if the walk happens from the set flush path. This is required by the new bitmap set type coming in a follow up patch which has no real struct nft_set_ext, so it has to allocate it based on the two bit compact element representation. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nf_tables_api.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ab155644d489..5830f594842e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -203,6 +203,7 @@ struct nft_set_elem { struct nft_set; struct nft_set_iter { u8 genmask; + bool flush; unsigned int count; unsigned int skip; int err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c09b11eb36fc..7ae810b03462 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3121,6 +3121,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; + iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) @@ -3374,6 +3375,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.iter.count = 0; args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; + args.iter.flush = false; set->ops->walk(&ctx, set, &args.iter); nla_nest_end(skb, nest); @@ -3939,6 +3941,7 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct nft_set_iter iter = { .genmask = genmask, .fn = nft_flush_set, + .flush = true, }; set->ops->walk(&ctx, set, &iter); @@ -5089,6 +5092,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, iter.count = 0; iter.err = 0; iter.fn = nf_tables_loop_check_setelem; + iter.flush = false; set->ops->walk(ctx, set, &iter); if (iter.err < 0) From 55af753cd9fda9c5300f5318253b08bd15fb412e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jan 2017 18:30:11 +0100 Subject: [PATCH 06/21] netfilter: nf_tables: rename struct nft_set_estimate class field Use lookup as field name instead, to prepare the introduction of the memory class in a follow up patch. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- net/netfilter/nf_tables_api.c | 12 ++++++------ net/netfilter/nft_set_hash.c | 2 +- net/netfilter/nft_set_rbtree.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5830f594842e..d76ac2f80a40 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -244,11 +244,11 @@ enum nft_set_class { * characteristics * * @size: required memory - * @class: lookup performance class + * @lookup: lookup performance class */ struct nft_set_estimate { unsigned int size; - enum nft_set_class class; + enum nft_set_class lookup; }; struct nft_set_ext; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7ae810b03462..fa7cd1679079 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2401,9 +2401,9 @@ nft_select_set_ops(const struct nlattr * const nla[], features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT; } - bops = NULL; - best.size = ~0; - best.class = ~0; + bops = NULL; + best.size = ~0; + best.lookup = ~0; list_for_each_entry(ops, &nf_tables_set_ops, list) { if ((ops->features & features) != features) @@ -2413,15 +2413,15 @@ nft_select_set_ops(const struct nlattr * const nla[], switch (policy) { case NFT_SET_POL_PERFORMANCE: - if (est.class < best.class) + if (est.lookup < best.lookup) break; - if (est.class == best.class && est.size < best.size) + if (est.lookup == best.lookup && est.size < best.size) break; continue; case NFT_SET_POL_MEMORY: if (est.size < best.size) break; - if (est.size == best.size && est.class < best.class) + if (est.size == best.size && est.lookup < best.lookup) break; continue; default: diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 2f10ac3b1b10..e58e7f02138b 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -384,7 +384,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, est->size = esize + 2 * sizeof(struct nft_hash_elem *); } - est->class = NFT_SET_CLASS_O_1; + est->lookup = NFT_SET_CLASS_O_1; return true; } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 81b8a4c2c061..2b6ea10c4bbd 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -291,7 +291,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, else est->size = nsize; - est->class = NFT_SET_CLASS_O_LOG_N; + est->lookup = NFT_SET_CLASS_O_LOG_N; return true; } From 0b5a78749260560f41e3b7c1f60f2c7dd9aff4f0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jan 2017 18:30:12 +0100 Subject: [PATCH 07/21] netfilter: nf_tables: add space notation to sets The space notation allows us to classify the set backend implementation based on the amount of required memory. This provides an order of the set representation scalability in terms of memory. The size field is still left in place so use this if the userspace provides no explicit number of elements, so we cannot calculate the real memory that this set needs. This also helps us break ties in the set backend selection routine, eg. two backend implementations provide the same performance. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 22 +++++++++++++++++----- net/netfilter/nft_set_hash.c | 1 + net/netfilter/nft_set_rbtree.c | 1 + 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d76ac2f80a40..21ce50e6d0c5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -245,10 +245,12 @@ enum nft_set_class { * * @size: required memory * @lookup: lookup performance class + * @space: memory class */ struct nft_set_estimate { unsigned int size; enum nft_set_class lookup; + enum nft_set_class space; }; struct nft_set_ext; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fa7cd1679079..cb6ae46f6c48 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2404,6 +2404,7 @@ nft_select_set_ops(const struct nlattr * const nla[], bops = NULL; best.size = ~0; best.lookup = ~0; + best.space = ~0; list_for_each_entry(ops, &nf_tables_set_ops, list) { if ((ops->features & features) != features) @@ -2415,14 +2416,25 @@ nft_select_set_ops(const struct nlattr * const nla[], case NFT_SET_POL_PERFORMANCE: if (est.lookup < best.lookup) break; - if (est.lookup == best.lookup && est.size < best.size) - break; + if (est.lookup == best.lookup) { + if (!desc->size) { + if (est.space < best.space) + break; + } else if (est.size < best.size) { + break; + } + } continue; case NFT_SET_POL_MEMORY: - if (est.size < best.size) - break; - if (est.size == best.size && est.lookup < best.lookup) + if (!desc->size) { + if (est.space < best.space) + break; + if (est.space == best.space && + est.lookup < best.lookup) + break; + } else if (est.size < best.size) { break; + } continue; default: break; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index e58e7f02138b..6938bc890f31 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -385,6 +385,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, } est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_N; return true; } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2b6ea10c4bbd..3387ed7dd231 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -292,6 +292,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, est->size = nsize; est->lookup = NFT_SET_CLASS_O_LOG_N; + est->space = NFT_SET_CLASS_O_N; return true; } From 665153ff575207f3a092cfcea3c51238612a7b58 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 18 Jan 2017 18:30:13 +0100 Subject: [PATCH 08/21] netfilter: nf_tables: add bitmap set type This patch adds a new bitmap set type. This bitmap uses two bits to represent one element. These two bits determine the element state in the current and the future generation that fits into the nf_tables commit protocol. When dumping elements back to userspace, the two bits are expanded into a struct nft_set_ext object. If no NFTA_SET_DESC_SIZE is specified, the existing automatic set backend selection prefers bitmap over hash in case of keys whose size is <= 16 bit. If the set size is know, the bitmap set type is selected if with 16 bit kets and more than 390 elements in the set, otherwise the hash table set implementation is used. For 8 bit keys, the bitmap consumes 66 bytes. For 16 bit keys, the bitmap takes 16388 bytes. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 6 + net/netfilter/Makefile | 1 + net/netfilter/nft_set_bitmap.c | 314 +++++++++++++++++++++++++++++++++ 3 files changed, 321 insertions(+) create mode 100644 net/netfilter/nft_set_bitmap.c diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index dfbe9deeb8c4..ea479ed43373 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -509,6 +509,12 @@ config NFT_SET_HASH This option adds the "hash" set type that is used to build one-way mappings between matchings and actions. +config NFT_SET_BITMAP + tristate "Netfilter nf_tables bitmap set module" + help + This option adds the "bitmap" set type that is used to build sets + whose keys are smaller or equal to 16 bits. + config NFT_COUNTER tristate "Netfilter nf_tables counter module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 6b3034f12661..c9b78e7b342f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o +obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c new file mode 100644 index 000000000000..97f9649bcc7e --- /dev/null +++ b/net/netfilter/nft_set_bitmap.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2017 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* This bitmap uses two bits to represent one element. These two bits determine + * the element state in the current and the future generation. + * + * An element can be in three states. The generation cursor is represented using + * the ^ character, note that this cursor shifts on every succesful transaction. + * If no transaction is going on, we observe all elements are in the following + * state: + * + * 11 = this element is active in the current generation. In case of no updates, + * ^ it stays active in the next generation. + * 00 = this element is inactive in the current generation. In case of no + * ^ updates, it stays inactive in the next generation. + * + * On transaction handling, we observe these two temporary states: + * + * 01 = this element is inactive in the current generation and it becomes active + * ^ in the next one. This happens when the element is inserted but commit + * path has not yet been executed yet, so activation is still pending. On + * transaction abortion, the element is removed. + * 10 = this element is active in the current generation and it becomes inactive + * ^ in the next one. This happens when the element is deactivated but commit + * path has not yet been executed yet, so removal is still pending. On + * transation abortion, the next generation bit is reset to go back to + * restore its previous state. + */ +struct nft_bitmap { + u16 bitmap_size; + u8 bitmap[]; +}; + +static inline void nft_bitmap_location(u32 key, u32 *idx, u32 *off) +{ + u32 k = (key << 1); + + *idx = k / BITS_PER_BYTE; + *off = k % BITS_PER_BYTE; +} + +/* Fetch the two bits that represent the element and check if it is active based + * on the generation mask. + */ +static inline bool +nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) +{ + return (bitmap[idx] & (0x3 << off)) & (genmask << off); +} + +static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + u32 idx, off; + + nft_bitmap_location(*key, &idx, &off); + + return nft_bitmap_active(priv->bitmap, idx, off, genmask); +} + +static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **_ext) +{ + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_set_ext *ext = elem->priv; + u8 genmask = nft_genmask_next(net); + u32 idx, off; + + nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) + return -EEXIST; + + /* Enter 01 state. */ + priv->bitmap[idx] |= (genmask << off); + + return 0; +} + +static void nft_bitmap_remove(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_set_ext *ext = elem->priv; + u8 genmask = nft_genmask_next(net); + u32 idx, off; + + nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + /* Enter 00 state. */ + priv->bitmap[idx] &= ~(genmask << off); +} + +static void nft_bitmap_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_set_ext *ext = elem->priv; + u8 genmask = nft_genmask_next(net); + u32 idx, off; + + nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + /* Enter 11 state. */ + priv->bitmap[idx] |= (genmask << off); +} + +static bool nft_bitmap_flush(const struct net *net, + const struct nft_set *set, void *ext) +{ + struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_next(net); + u32 idx, off; + + nft_bitmap_location(nft_set_ext_key(ext)->data[0], &idx, &off); + /* Enter 10 state, similar to deactivation. */ + priv->bitmap[idx] &= ~(genmask << off); + + return true; +} + +static struct nft_set_ext *nft_bitmap_ext_alloc(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_set_ext_tmpl tmpl; + struct nft_set_ext *ext; + + nft_set_ext_prepare(&tmpl); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + + ext = kzalloc(tmpl.len, GFP_KERNEL); + if (!ext) + return NULL; + + nft_set_ext_init(ext, &tmpl); + memcpy(nft_set_ext_key(ext), elem->key.val.data, set->klen); + + return ext; +} + +static void *nft_bitmap_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_bitmap *priv = nft_set_priv(set); + u8 genmask = nft_genmask_next(net); + struct nft_set_ext *ext; + u32 idx, off, key = 0; + + memcpy(&key, elem->key.val.data, set->klen); + nft_bitmap_location(key, &idx, &off); + + if (!nft_bitmap_active(priv->bitmap, idx, off, genmask)) + return NULL; + + /* We have no real set extension since this is a bitmap, allocate this + * dummy object that is released from the commit/abort path. + */ + ext = nft_bitmap_ext_alloc(set, elem); + if (!ext) + return NULL; + + /* Enter 10 state. */ + priv->bitmap[idx] &= ~(genmask << off); + + return ext; +} + +static void nft_bitmap_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + const struct nft_bitmap *priv = nft_set_priv(set); + struct nft_set_ext_tmpl tmpl; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int idx, off; + u16 key; + + nft_set_ext_prepare(&tmpl); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + + for (idx = 0; idx < priv->bitmap_size; idx++) { + for (off = 0; off < BITS_PER_BYTE; off += 2) { + if (iter->count < iter->skip) + goto cont; + + if (!nft_bitmap_active(priv->bitmap, idx, off, + iter->genmask)) + goto cont; + + ext = kzalloc(tmpl.len, GFP_KERNEL); + if (!ext) { + iter->err = -ENOMEM; + return; + } + nft_set_ext_init(ext, &tmpl); + key = ((idx * BITS_PER_BYTE) + off) >> 1; + memcpy(nft_set_ext_key(ext), &key, set->klen); + + elem.priv = ext; + iter->err = iter->fn(ctx, set, iter, &elem); + + /* On set flush, this dummy extension object is released + * from the commit/abort path. + */ + if (!iter->flush) + kfree(ext); + + if (iter->err < 0) + return; +cont: + iter->count++; + } + } +} + +/* The bitmap size is pow(2, key length in bits) / bits per byte. This is + * multiplied by two since each element takes two bits. For 8 bit keys, the + * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes. + */ +static inline u32 nft_bitmap_size(u32 klen) +{ + return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1; +} + +static inline u32 nft_bitmap_total_size(u32 klen) +{ + return sizeof(struct nft_bitmap) + nft_bitmap_size(klen); +} + +static unsigned int nft_bitmap_privsize(const struct nlattr * const nla[]) +{ + u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN])); + + return nft_bitmap_total_size(klen); +} + +static int nft_bitmap_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_bitmap *priv = nft_set_priv(set); + + priv->bitmap_size = nft_bitmap_total_size(set->klen); + + return 0; +} + +static void nft_bitmap_destroy(const struct nft_set *set) +{ +} + +static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + /* Make sure bitmaps we don't get bitmaps larger than 16 Kbytes. */ + if (desc->klen > 2) + return false; + + est->size = nft_bitmap_total_size(desc->klen); + est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_1; + + return true; +} + +static struct nft_set_ops nft_bitmap_ops __read_mostly = { + .privsize = nft_bitmap_privsize, + .estimate = nft_bitmap_estimate, + .init = nft_bitmap_init, + .destroy = nft_bitmap_destroy, + .insert = nft_bitmap_insert, + .remove = nft_bitmap_remove, + .deactivate = nft_bitmap_deactivate, + .flush = nft_bitmap_flush, + .activate = nft_bitmap_activate, + .lookup = nft_bitmap_lookup, + .walk = nft_bitmap_walk, + .owner = THIS_MODULE, +}; + +static int __init nft_bitmap_module_init(void) +{ + return nft_register_set(&nft_bitmap_ops); +} + +static void __exit nft_bitmap_module_exit(void) +{ + nft_unregister_set(&nft_bitmap_ops); +} + +module_init(nft_bitmap_module_init); +module_exit(nft_bitmap_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_SET(); From ab23821f7ecfb022a4aec78fb6f4fd0f6aa1ccab Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Feb 2017 13:35:48 +0100 Subject: [PATCH 09/21] netfilter: nft_ct: add zone id get support Just like with counters the direction attribute is optional. We set priv->dir to MAX unconditionally to avoid duplicating the assignment for all keys with optional direction. For keys where direction is mandatory, existing code already returns an error. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_ct.c | 22 +++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 53aac8b8ed6b..3e60ed78c538 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -870,6 +870,7 @@ enum nft_rt_attributes { * @NFT_CT_PKTS: conntrack packets * @NFT_CT_BYTES: conntrack bytes * @NFT_CT_AVGPKT: conntrack average bytes per packet + * @NFT_CT_ZONE: conntrack zone */ enum nft_ct_keys { NFT_CT_STATE, @@ -889,6 +890,7 @@ enum nft_ct_keys { NFT_CT_PKTS, NFT_CT_BYTES, NFT_CT_AVGPKT, + NFT_CT_ZONE, }; /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 66a2377510e1..5bd4cdfdcda5 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -151,6 +151,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr, case NFT_CT_PROTOCOL: *dest = nf_ct_protonum(ct); return; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: { + const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + + if (priv->dir < IP_CT_DIR_MAX) + *dest = nf_ct_zone_id(zone, priv->dir); + else + *dest = zone->id; + + return; + } +#endif default: break; } @@ -266,6 +278,7 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); + priv->dir = IP_CT_DIR_MAX; switch (priv->key) { case NFT_CT_DIRECTION: if (tb[NFTA_CT_DIRECTION] != NULL) @@ -333,11 +346,13 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: - /* no direction? return sum of original + reply */ - if (tb[NFTA_CT_DIRECTION] == NULL) - priv->dir = IP_CT_DIR_MAX; len = sizeof(u64); break; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: + len = sizeof(u16); + break; +#endif default: return -EOPNOTSUPP; } @@ -465,6 +480,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: + case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; From 5c178d81b69f08ca3195427a6ea9a46d9af23127 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Feb 2017 13:35:49 +0100 Subject: [PATCH 10/21] netfilter: nft_ct: prepare for key-dependent error unwind Next patch will add ZONE_ID set support which will need similar error unwind (put operation) as conntrack labels. Prepare for this: remove the 'label_got' boolean in favor of a switch statement that can be extended in next patch. As we already have that in the set_destroy function place that in a separate function and call it from the set init function. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 5bd4cdfdcda5..2d82df2737da 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -386,12 +386,24 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, return 0; } +static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) +{ + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_put(ctx->net); + break; +#endif + default: + break; + } +} + static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); - bool label_got = false; unsigned int len; int err; @@ -412,7 +424,6 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); if (err) return err; - label_got = true; break; #endif default: @@ -431,8 +442,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, return 0; err1: - if (label_got) - nf_connlabels_put(ctx->net); + __nft_ct_set_destroy(ctx, priv); return err; } @@ -447,16 +457,7 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, { struct nft_ct *priv = nft_expr_priv(expr); - switch (priv->key) { -#ifdef CONFIG_NF_CONNTRACK_LABELS - case NFT_CT_LABELS: - nf_connlabels_put(ctx->net); - break; -#endif - default: - break; - } - + __nft_ct_set_destroy(ctx, priv); nft_ct_netns_put(ctx->net, ctx->afi->family); } From edee4f1e92458299505ff007733f676b00c516a1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 3 Feb 2017 13:35:50 +0100 Subject: [PATCH 11/21] netfilter: nft_ct: add zone id set support zones allow tracking multiple connections sharing identical tuples, this is needed e.g. when tracking distinct vlans with overlapping ip addresses (conntrack is l2 agnostic). Thus the zone has to be set before the packet is picked up by the connection tracker. This is done by means of 'conntrack templates' which are conntrack structures used solely to pass this info from one netfilter hook to the next. The iptables CT target instantiates these connection tracking templates once per rule, i.e. the template is fixed/tied to particular zone, can be read-only and therefore be re-used by as many skbs simultaneously as needed. We can't follow this model because we want to take the zone id from an sreg at rule eval time so we could e.g. fill in the zone id from the packets vlan id or a e.g. nftables key : value maps. To avoid cost of per packet alloc/free of the template, use a percpu template 'scratch' object and use the refcount to detect the (unlikely) case where the template is still attached to another skb (i.e., previous skb was nfqueued ...). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 144 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 143 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 2d82df2737da..c6b8022c0e47 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -32,6 +32,11 @@ struct nft_ct { }; }; +#ifdef CONFIG_NF_CONNTRACK_ZONES +static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); +static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; +#endif + static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, enum nft_ct_keys k, enum ip_conntrack_dir d) @@ -191,6 +196,53 @@ err: regs->verdict.code = NFT_BREAK; } +#ifdef CONFIG_NF_CONNTRACK_ZONES +static void nft_ct_set_zone_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR }; + const struct nft_ct *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + enum ip_conntrack_info ctinfo; + u16 value = regs->data[priv->sreg]; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) /* already tracked */ + return; + + zone.id = value; + + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + zone.dir = NF_CT_ZONE_DIR_ORIG; + break; + case IP_CT_DIR_REPLY: + zone.dir = NF_CT_ZONE_DIR_REPL; + break; + default: + break; + } + + ct = this_cpu_read(nft_ct_pcpu_template); + + if (likely(atomic_read(&ct->ct_general.use) == 1)) { + nf_ct_zone_add(ct, &zone); + } else { + /* previous skb got queued to userspace */ + ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); + if (!ct) { + regs->verdict.code = NF_DROP; + return; + } + } + + atomic_inc(&ct->ct_general.use); + nf_ct_set(skb, ct, IP_CT_NEW); +} +#endif + static void nft_ct_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -269,6 +321,45 @@ static void nft_ct_netns_put(struct net *net, uint8_t family) nf_ct_netns_put(net, family); } +#ifdef CONFIG_NF_CONNTRACK_ZONES +static void nft_ct_tmpl_put_pcpu(void) +{ + struct nf_conn *ct; + int cpu; + + for_each_possible_cpu(cpu) { + ct = per_cpu(nft_ct_pcpu_template, cpu); + if (!ct) + break; + nf_ct_put(ct); + per_cpu(nft_ct_pcpu_template, cpu) = NULL; + } +} + +static bool nft_ct_tmpl_alloc_pcpu(void) +{ + struct nf_conntrack_zone zone = { .id = 0 }; + struct nf_conn *tmp; + int cpu; + + if (nft_ct_pcpu_template_refcnt) + return true; + + for_each_possible_cpu(cpu) { + tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL); + if (!tmp) { + nft_ct_tmpl_put_pcpu(); + return false; + } + + atomic_set(&tmp->ct_general.use, 1); + per_cpu(nft_ct_pcpu_template, cpu) = tmp; + } + + return true; +} +#endif + static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -393,6 +484,11 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) case NFT_CT_LABELS: nf_connlabels_put(ctx->net); break; +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: + if (--nft_ct_pcpu_template_refcnt == 0) + nft_ct_tmpl_put_pcpu(); #endif default: break; @@ -407,6 +503,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, unsigned int len; int err; + priv->dir = IP_CT_DIR_MAX; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK @@ -425,11 +522,29 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, if (err) return err; break; +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES + case NFT_CT_ZONE: + if (!nft_ct_tmpl_alloc_pcpu()) + return -ENOMEM; + nft_ct_pcpu_template_refcnt++; + break; #endif default: return -EOPNOTSUPP; } + if (tb[NFTA_CT_DIRECTION]) { + priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); + switch (priv->dir) { + case IP_CT_DIR_ORIGINAL: + case IP_CT_DIR_REPLY: + break; + default: + return -EINVAL; + } + } + priv->sreg = nft_parse_register(tb[NFTA_CT_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) @@ -504,6 +619,17 @@ static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; + + switch (priv->key) { + case NFT_CT_ZONE: + if (priv->dir < IP_CT_DIR_MAX && + nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) + goto nla_put_failure; + break; + default: + break; + } + return 0; nla_put_failure: @@ -529,6 +655,17 @@ static const struct nft_expr_ops nft_ct_set_ops = { .dump = nft_ct_set_dump, }; +#ifdef CONFIG_NF_CONNTRACK_ZONES +static const struct nft_expr_ops nft_ct_set_zone_ops = { + .type = &nft_ct_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), + .eval = nft_ct_set_zone_eval, + .init = nft_ct_set_init, + .destroy = nft_ct_set_destroy, + .dump = nft_ct_set_dump, +}; +#endif + static const struct nft_expr_ops * nft_ct_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -542,8 +679,13 @@ nft_ct_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_CT_DREG]) return &nft_ct_get_ops; - if (tb[NFTA_CT_SREG]) + if (tb[NFTA_CT_SREG]) { +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE)) + return &nft_ct_set_zone_ops; +#endif return &nft_ct_set_ops; + } return ERR_PTR(-EINVAL); } From 935b7f643018878bd9d4193eea8b575aff736b9b Mon Sep 17 00:00:00 2001 From: Manuel Messner Date: Tue, 7 Feb 2017 03:14:53 +0100 Subject: [PATCH 12/21] netfilter: nft_exthdr: add TCP option matching This patch implements the kernel side of the TCP option patch. Signed-off-by: Manuel Messner Reviewed-by: Florian Westphal Acked-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 17 +++- net/netfilter/Kconfig | 4 +- net/netfilter/nft_exthdr.c | 119 ++++++++++++++++++++--- 3 files changed, 124 insertions(+), 16 deletions(-) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 3e60ed78c538..207951516ede 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -709,13 +709,27 @@ enum nft_exthdr_flags { }; /** - * enum nft_exthdr_attributes - nf_tables IPv6 extension header expression netlink attributes + * enum nft_exthdr_op - nf_tables match options + * + * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers + * @NFT_EXTHDR_OP_TCP: match against tcp options + */ +enum nft_exthdr_op { + NFT_EXTHDR_OP_IPV6, + NFT_EXTHDR_OP_TCPOPT, + __NFT_EXTHDR_OP_MAX +}; +#define NFT_EXTHDR_OP_MAX (__NFT_EXTHDR_OP_MAX - 1) + +/** + * enum nft_exthdr_attributes - nf_tables extension header expression netlink attributes * * @NFTA_EXTHDR_DREG: destination register (NLA_U32: nft_registers) * @NFTA_EXTHDR_TYPE: extension header type (NLA_U8) * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) + * @NFTA_EXTHDR_OP: option match type (NLA_U8) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -724,6 +738,7 @@ enum nft_exthdr_attributes { NFTA_EXTHDR_OFFSET, NFTA_EXTHDR_LEN, NFTA_EXTHDR_FLAGS, + NFTA_EXTHDR_OP, __NFTA_EXTHDR_MAX }; #define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ea479ed43373..9b28864cc36a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -467,10 +467,10 @@ config NF_TABLES_NETDEV This option enables support for the "netdev" table. config NFT_EXTHDR - tristate "Netfilter nf_tables IPv6 exthdr module" + tristate "Netfilter nf_tables exthdr module" help This option adds the "exthdr" expression that you can use to match - IPv6 extension headers. + IPv6 extension headers and tcp options. config NFT_META tristate "Netfilter nf_tables meta module" diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a89e5ab150db..c308920b194c 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -15,20 +15,29 @@ #include #include #include -// FIXME: -#include +#include struct nft_exthdr { u8 type; u8 offset; u8 len; + u8 op; enum nft_registers dreg:8; u8 flags; }; -static void nft_exthdr_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static unsigned int optlen(const u8 *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset + 1] == 0) + return 1; + else + return opt[offset + 1]; +} + +static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_exthdr *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; @@ -52,6 +61,53 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_tcp_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int i, optl, tcphdr_len, offset; + u32 *dest = ®s->data[priv->dreg]; + struct tcphdr *tcph; + u8 *opt; + + if (!pkt->tprot_set || pkt->tprot != IPPROTO_TCP) + goto err; + + tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, sizeof(*tcph), buff); + if (!tcph) + goto err; + + tcphdr_len = __tcp_hdrlen(tcph); + if (tcphdr_len < sizeof(*tcph)) + goto err; + + tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, tcphdr_len, buff); + if (!tcph) + goto err; + + opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { + optl = optlen(opt, i); + + if (priv->type != opt[i]) + continue; + + if (i + optl > tcphdr_len || priv->len + priv->offset > optl) + goto err; + + offset = i + priv->offset; + dest[priv->len / NFT_REG32_SIZE] = 0; + memcpy(dest, opt + offset, priv->len); + + return; + } + +err: + regs->verdict.code = NFT_BREAK; +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -65,13 +121,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 offset, len, flags = 0; + u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; int err; - if (tb[NFTA_EXTHDR_DREG] == NULL || - tb[NFTA_EXTHDR_TYPE] == NULL || - tb[NFTA_EXTHDR_OFFSET] == NULL || - tb[NFTA_EXTHDR_LEN] == NULL) + if (!tb[NFTA_EXTHDR_DREG] || + !tb[NFTA_EXTHDR_TYPE] || + !tb[NFTA_EXTHDR_OFFSET] || + !tb[NFTA_EXTHDR_LEN]) return -EINVAL; err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); @@ -91,11 +147,18 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, return -EINVAL; } + if (tb[NFTA_EXTHDR_OP]) { + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); + if (err < 0) + return err; + } + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; priv->len = len; priv->dreg = nft_parse_register(tb[NFTA_EXTHDR_DREG]); priv->flags = flags; + priv->op = op; return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, priv->len); @@ -115,6 +178,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_be32(skb, NFTA_EXTHDR_FLAGS, htonl(priv->flags))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_EXTHDR_OP, htonl(priv->op))) + goto nla_put_failure; return 0; nla_put_failure: @@ -122,17 +187,45 @@ nla_put_failure: } static struct nft_expr_type nft_exthdr_type; -static const struct nft_expr_ops nft_exthdr_ops = { +static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), - .eval = nft_exthdr_eval, + .eval = nft_exthdr_ipv6_eval, .init = nft_exthdr_init, .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_tcp_ops = { + .type = &nft_exthdr_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_tcp_eval, + .init = nft_exthdr_init, + .dump = nft_exthdr_dump, +}; + +static const struct nft_expr_ops * +nft_exthdr_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + u32 op; + + if (!tb[NFTA_EXTHDR_OP]) + return &nft_exthdr_ipv6_ops; + + op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP])); + switch (op) { + case NFT_EXTHDR_OP_TCPOPT: + return &nft_exthdr_tcp_ops; + case NFT_EXTHDR_OP_IPV6: + return &nft_exthdr_ipv6_ops; + } + + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_exthdr_type __read_mostly = { .name = "exthdr", - .ops = &nft_exthdr_ops, + .select_ops = &nft_exthdr_select_ops, .policy = nft_exthdr_policy, .maxattr = NFTA_EXTHDR_MAX, .owner = THIS_MODULE, From a96e66e702bc615f5927b15c739c91d5098dac59 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 9 Feb 2017 16:46:45 +0800 Subject: [PATCH 13/21] netfilter: nf_ct_sip: Use mod_timer_pending() timer_del() followed by timer_add() can be replaced by mod_timer_pending(). Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index c3fc14e021ec..24174c520239 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -809,13 +809,11 @@ static int refresh_signalling_expectation(struct nf_conn *ct, exp->tuple.dst.protonum != proto || exp->tuple.dst.u.udp.port != port) continue; - if (!del_timer(&exp->timeout)) - continue; - exp->flags &= ~NF_CT_EXPECT_INACTIVE; - exp->timeout.expires = jiffies + expires * HZ; - add_timer(&exp->timeout); - found = 1; - break; + if (mod_timer_pending(&exp->timeout, jiffies + expires * HZ)) { + exp->flags &= ~NF_CT_EXPECT_INACTIVE; + found = 1; + break; + } } spin_unlock_bh(&nf_conntrack_expect_lock); return found; From 4dee62b1b9b43c9bbf49c93cd114e1bf1334fb6a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 9 Feb 2017 21:40:38 +0800 Subject: [PATCH 14/21] netfilter: nf_ct_expect: nf_ct_expect_insert() returns void Because nf_ct_expect_insert() always succeeds now, its return value can be just void instead of int. And remove code that checks for its return value. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_expect.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index f8dbacf66795..e19a69787d99 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -353,7 +353,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp) } EXPORT_SYMBOL_GPL(nf_ct_expect_put); -static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) +static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; @@ -380,7 +380,6 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) add_timer(&exp->timeout); NF_CT_STAT_INC(net, expect_create); - return 0; } /* Race with expectations being used means we could have none to find; OK. */ @@ -464,9 +463,8 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, if (ret <= 0) goto out; - ret = nf_ct_expect_insert(expect); - if (ret < 0) - goto out; + nf_ct_expect_insert(expect); + spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return ret; From b745d0358db44ebcef31478314436d6d4ff93bfa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Feb 2017 12:08:08 +0100 Subject: [PATCH 15/21] netfilter: nfnetlink: get rid of u_intX_t types Use uX types instead. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index a09fa9fd8f3d..586212ebba9e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -100,9 +100,9 @@ int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n) } EXPORT_SYMBOL_GPL(nfnetlink_subsys_unregister); -static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t type) +static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u16 type) { - u_int8_t subsys_id = NFNL_SUBSYS_ID(type); + u8 subsys_id = NFNL_SUBSYS_ID(type); if (subsys_id >= NFNL_SUBSYS_COUNT) return NULL; @@ -111,9 +111,9 @@ static inline const struct nfnetlink_subsystem *nfnetlink_get_subsys(u_int16_t t } static inline const struct nfnl_callback * -nfnetlink_find_client(u_int16_t type, const struct nfnetlink_subsystem *ss) +nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss) { - u_int8_t cb_id = NFNL_MSG_TYPE(type); + u8 cb_id = NFNL_MSG_TYPE(type); if (cb_id >= ss->cb_count) return NULL; @@ -185,7 +185,7 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; @@ -273,7 +273,7 @@ enum { }; static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, - u_int16_t subsys_id) + u16 subsys_id) { struct sk_buff *oskb = skb; struct net *net = sock_net(skb->sk); @@ -365,7 +365,7 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); - u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); + u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; @@ -439,7 +439,7 @@ done: static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - u_int16_t res_id; + u16 res_id; int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || From 48656835c0405aa2e6c0d6a4305c77b70758d168 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Feb 2017 12:08:14 +0100 Subject: [PATCH 16/21] netfilter: nfnetlink: add nfnetlink_rcv_skb_batch() Add new nfnetlink_rcv_skb_batch() to wrap initial nfnetlink batch handling. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 51 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 586212ebba9e..ca645a3b1375 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -436,11 +436,34 @@ done: kfree_skb(skb); } +static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct nfgenmsg *nfgenmsg; + u16 res_id; + int msglen; + + msglen = NLMSG_ALIGN(nlh->nlmsg_len); + if (msglen > skb->len) + msglen = skb->len; + + if (nlh->nlmsg_len < NLMSG_HDRLEN || + skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) + return; + + nfgenmsg = nlmsg_data(nlh); + skb_pull(skb, msglen); + /* Work around old nft using host byte order */ + if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) + res_id = NFNL_SUBSYS_NFTABLES; + else + res_id = ntohs(nfgenmsg->res_id); + + nfnetlink_rcv_batch(skb, nlh, res_id); +} + static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - u16 res_id; - int msglen; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) @@ -451,28 +474,10 @@ static void nfnetlink_rcv(struct sk_buff *skb) return; } - if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) { - struct nfgenmsg *nfgenmsg; - - msglen = NLMSG_ALIGN(nlh->nlmsg_len); - if (msglen > skb->len) - msglen = skb->len; - - if (nlh->nlmsg_len < NLMSG_HDRLEN || - skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) - return; - - nfgenmsg = nlmsg_data(nlh); - skb_pull(skb, msglen); - /* Work around old nft using host byte order */ - if (nfgenmsg->res_id == NFNL_SUBSYS_NFTABLES) - res_id = NFNL_SUBSYS_NFTABLES; - else - res_id = ntohs(nfgenmsg->res_id); - nfnetlink_rcv_batch(skb, nlh, res_id); - } else { + if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) + nfnetlink_rcv_skb_batch(skb, nlh); + else netlink_rcv_skb(skb, &nfnetlink_rcv_msg); - } } #ifdef CONFIG_MODULES From 8c4d4e8b5626fec965fd5034e5bd5e57790f243f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Feb 2017 12:08:17 +0100 Subject: [PATCH 17/21] netfilter: nfnetlink: allow to check for generation ID This patch allows userspace to specify the generation ID that has been used to build an incremental batch update. If userspace specifies the generation ID in the batch message as attribute, then nfnetlink compares it to the current generation ID so you make sure that you work against the right baseline. Otherwise, bail out with ERESTART so userspace knows that its changeset is stale and needs to respin. Userspace can do this transparently at the cost of taking slightly more time to refresh caches and rework the changeset. This check is optional, if there is no NFNL_BATCH_GENID attribute in the batch begin message, then no check is performed. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 1 + include/uapi/linux/netfilter/nfnetlink.h | 12 +++++++++ net/netfilter/nfnetlink.c | 31 +++++++++++++++++++++--- 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 1d82dd5e9a08..1b49209dd5c7 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -28,6 +28,7 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb); + bool (*valid_genid)(struct net *net, u32 genid); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 4bb8cb7730e7..a09906a30d77 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -65,4 +65,16 @@ struct nfgenmsg { #define NFNL_MSG_BATCH_BEGIN NLMSG_MIN_TYPE #define NFNL_MSG_BATCH_END NLMSG_MIN_TYPE+1 +/** + * enum nfnl_batch_attributes - nfnetlink batch netlink attributes + * + * @NFNL_BATCH_GENID: generation ID for this changeset (NLA_U32) + */ +enum nfnl_batch_attributes { + NFNL_BATCH_UNSPEC, + NFNL_BATCH_GENID, + __NFNL_BATCH_MAX +}; +#define NFNL_BATCH_MAX (__NFNL_BATCH_MAX - 1) + #endif /* _UAPI_NFNETLINK_H */ diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index ca645a3b1375..a2148d0bc50e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -3,7 +3,7 @@ * * (C) 2001 by Jay Schulist , * (C) 2002-2005 by Harald Welte - * (C) 2005,2007 by Pablo Neira Ayuso + * (C) 2005-2017 by Pablo Neira Ayuso * * Initial netfilter messages via netlink development funded and * generally made possible by Network Robots, Inc. (www.networkrobots.com) @@ -273,7 +273,7 @@ enum { }; static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, - u16 subsys_id) + u16 subsys_id, u32 genid) { struct sk_buff *oskb = skb; struct net *net = sock_net(skb->sk); @@ -315,6 +315,12 @@ replay: return kfree_skb(skb); } + if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) { + nfnl_unlock(subsys_id); + netlink_ack(oskb, nlh, -ERESTART); + return kfree_skb(skb); + } + while (skb->len >= nlmsg_total_size(0)) { int msglen, type; @@ -436,11 +442,20 @@ done: kfree_skb(skb); } +static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = { + [NFNL_BATCH_GENID] = { .type = NLA_U32 }, +}; + static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) { + int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + struct nlattr *attr = (void *)nlh + min_len; + struct nlattr *cda[NFNL_BATCH_MAX + 1]; + int attrlen = nlh->nlmsg_len - min_len; struct nfgenmsg *nfgenmsg; + int msglen, err; + u32 gen_id = 0; u16 res_id; - int msglen; msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) @@ -450,6 +465,14 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) return; + err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy); + if (err < 0) { + netlink_ack(skb, nlh, err); + return; + } + if (cda[NFNL_BATCH_GENID]) + gen_id = ntohl(nla_get_be32(cda[NFNL_BATCH_GENID])); + nfgenmsg = nlmsg_data(nlh); skb_pull(skb, msglen); /* Work around old nft using host byte order */ @@ -458,7 +481,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) else res_id = ntohs(nfgenmsg->res_id); - nfnetlink_rcv_batch(skb, nlh, res_id); + nfnetlink_rcv_batch(skb, nlh, res_id, gen_id); } static void nfnetlink_rcv(struct sk_buff *skb) From 74e8bcd21c40dbbb3d74fa904536f8a3bddafed3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Feb 2017 12:08:20 +0100 Subject: [PATCH 18/21] netfilter: nf_tables: add check_genid to the nfnetlink subsystem This patch implements the check generation id as provided by nfnetlink. This allows us to reject ruleset updates against stale baseline, so userspace can retry update with a fresh ruleset cache. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cb6ae46f6c48..71c60a04b66b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4972,6 +4972,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) return 0; } +static bool nf_tables_valid_genid(struct net *net, u32 genid) +{ + return net->nft.base_seq == genid; +} + static const struct nfnetlink_subsystem nf_tables_subsys = { .name = "nf_tables", .subsys_id = NFNL_SUBSYS_NFTABLES, @@ -4979,6 +4984,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .cb = nf_tables_cb, .commit = nf_tables_commit, .abort = nf_tables_abort, + .valid_genid = nf_tables_valid_genid, }; int nft_chain_validate_dependency(const struct nft_chain *chain, From 1a94e38d254b3622d5d53f74b3b716b0fcab0ba8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Feb 2017 12:08:23 +0100 Subject: [PATCH 19/21] netfilter: nf_tables: add NFTA_RULE_ID attribute This new attribute allows us to uniquely identify a rule in transaction. Robots may trigger an insertion followed by deletion in a batch, in that scenario we still don't have a public rule handle that we can use to delete the rule. This is similar to the NFTA_SET_ID attribute that allows us to refer to an anonymous set from a batch. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 +++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 26 ++++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 21ce50e6d0c5..ac84686aaafb 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1202,10 +1202,13 @@ struct nft_trans { struct nft_trans_rule { struct nft_rule *rule; + u32 rule_id; }; #define nft_trans_rule(trans) \ (((struct nft_trans_rule *)trans->data)->rule) +#define nft_trans_rule_id(trans) \ + (((struct nft_trans_rule *)trans->data)->rule_id) struct nft_trans_set { struct nft_set *set; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 207951516ede..05215d30fe5c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -207,6 +207,7 @@ enum nft_chain_attributes { * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes) * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64) * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN) + * @NFTA_RULE_ID: uniquely identifies a rule in a transaction (NLA_U32) */ enum nft_rule_attributes { NFTA_RULE_UNSPEC, @@ -218,6 +219,7 @@ enum nft_rule_attributes { NFTA_RULE_POSITION, NFTA_RULE_USERDATA, NFTA_RULE_PAD, + NFTA_RULE_ID, __NFTA_RULE_MAX }; #define NFTA_RULE_MAX (__NFTA_RULE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 71c60a04b66b..6c782532615f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -240,6 +240,10 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, if (trans == NULL) return NULL; + if (msg_type == NFT_MSG_NEWRULE && ctx->nla[NFTA_RULE_ID] != NULL) { + nft_trans_rule_id(trans) = + ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); + } nft_trans_rule(trans) = rule; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -2293,6 +2297,22 @@ err1: return err; } +static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nlattr *nla) +{ + u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; + + list_for_each_entry(trans, &net->nft.commit_list, list) { + struct nft_rule *rule = nft_trans_rule(trans); + + if (trans->msg_type == NFT_MSG_NEWRULE && + id == nft_trans_rule_id(trans)) + return rule; + } + return ERR_PTR(-ENOENT); +} + static int nf_tables_delrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -2330,6 +2350,12 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, if (IS_ERR(rule)) return PTR_ERR(rule); + err = nft_delrule(&ctx, rule); + } else if (nla[NFTA_RULE_ID]) { + rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); + if (IS_ERR(rule)) + return PTR_ERR(rule); + err = nft_delrule(&ctx, rule); } else { err = nft_delrule_by_chain(&ctx); From fc52497eb9dd031de9d17af54ac8f4078663a9f5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Feb 2017 12:08:29 +0100 Subject: [PATCH 20/21] netfilter: update MAINTAINERS It's been a while since Patrick has been suspended as coreteam member [1]. Update this file to remove him. While at this, remove references to all foo-tables variants, given the project hosts more than just that, eg. ipset, conntrack, ... [1] https://marc.info/?l=netfilter-devel&m=146887464512702 Signed-off-by: Pablo Neira Ayuso --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index a9368bba9b37..5864bbd99f8f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8579,9 +8579,8 @@ F: Documentation/networking/s2io.txt F: Documentation/networking/vxge.txt F: drivers/net/ethernet/neterion/ -NETFILTER ({IP,IP6,ARP,EB,NF}TABLES) +NETFILTER M: Pablo Neira Ayuso -M: Patrick McHardy M: Jozsef Kadlecsik L: netfilter-devel@vger.kernel.org L: coreteam@netfilter.org From 7286ff7fde9f963736c7e575572899d8e16b06b7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Feb 2017 19:59:36 +0100 Subject: [PATCH 21/21] netfilter: nf_tables: honor NFT_SET_OBJECT in set backend selection Check for NFT_SET_OBJECT feature flag, otherwise we may end up selecting the wrong set backend. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 ++- net/netfilter/nft_set_hash.c | 2 +- net/netfilter/nft_set_rbtree.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6c782532615f..ff7304ae58ac 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2424,7 +2424,8 @@ nft_select_set_ops(const struct nlattr * const nla[], features = 0; if (nla[NFTA_SET_FLAGS] != NULL) { features = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); - features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT; + features &= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_TIMEOUT | + NFT_SET_OBJECT; } bops = NULL; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6938bc890f31..5f652720fc78 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -404,7 +404,7 @@ static struct nft_set_ops nft_hash_ops __read_mostly = { .lookup = nft_hash_lookup, .update = nft_hash_update, .walk = nft_hash_walk, - .features = NFT_SET_MAP | NFT_SET_TIMEOUT, + .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .owner = THIS_MODULE, }; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 3387ed7dd231..71e8fb886a73 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -310,7 +310,7 @@ static struct nft_set_ops nft_rbtree_ops __read_mostly = { .activate = nft_rbtree_activate, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, - .features = NFT_SET_INTERVAL | NFT_SET_MAP, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, .owner = THIS_MODULE, };