rtnetlink: Add per-netns RTNL.

The goal is to break RTNL down into a per-netns mutex.

This patch adds a per-netns mutex and its helper functions, rtnl_net_lock()
and rtnl_net_unlock().

rtnl_net_lock() acquires the global RTNL and the per-netns RTNL mutex, and
rtnl_net_unlock() releases them.
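
For example, a converted caller would look roughly like the sketch below
(the handler is hypothetical; only rtnl_net_lock()/rtnl_net_unlock() come
from this patch):

  /* Hypothetical handler mutating state in a single netns.
   * Before the conversion it would call rtnl_lock()/rtnl_unlock().
   */
  static int change_net_config(struct net *net)
  {
  	int err;

  	rtnl_net_lock(net);	/* takes RTNL, then net->rtnl_mutex */

  	err = 0;		/* ... modify per-netns state here ... */

  	rtnl_net_unlock(net);	/* drops net->rtnl_mutex, then RTNL */

  	return err;
  }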

We will replace 800+ rtnl_lock() calls with rtnl_net_lock() and finally
remove the rtnl_lock() call in rtnl_net_lock().

When we need to nest per-netns RTNL mutexes, we will use __rtnl_net_lock(),
and its locking order is defined by rtnl_net_lock_cmp_fn() as follows (see
the sketch after the list):

  1. init_net is first
  2. netns address ascending order
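
A nested acquisition following these rules could look like this (the
two-netns helper is hypothetical; __rtnl_net_lock() and the ordering rule
come from this patch):

  /* Hypothetical helper: hold RTNL, then both per-netns mutexes in
   * rtnl_net_lock_cmp_fn() order: init_net first, then ascending
   * struct net address.
   */
  static void rtnl_net_lock_two(struct net *net_a, struct net *net_b)
  {
  	rtnl_lock();

  	if (net_eq(net_a, net_b)) {
  		__rtnl_net_lock(net_a);
  	} else if (net_eq(net_a, &init_net) ||
  		   (!net_eq(net_b, &init_net) && net_a < net_b)) {
  		__rtnl_net_lock(net_a);
  		__rtnl_net_lock(net_b);
  	} else {
  		__rtnl_net_lock(net_b);
  		__rtnl_net_lock(net_a);
  	}
  }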

Note that the conversion will be done under CONFIG_DEBUG_NET_SMALL_RTNL
with LOCKDEP so that we can carefully add the extra mutex, catching
ordering bugs early, without slowing down RTNL operations on non-debug
kernels during the conversion.
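
For reference, a debug build exercising this code would enable something
like the following .config fragment (a sketch implied by the new Kconfig
entry; PROVE_LOCKING is selected automatically):

  CONFIG_DEBUG_KERNEL=y
  CONFIG_LOCK_DEBUGGING_SUPPORT=y
  CONFIG_DEBUG_NET_SMALL_RTNL=y
  CONFIG_PROVE_LOCKING=y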

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
commit 76aed95319 (parent ec763c234d)
Author:    Kuniyuki Iwashima <kuniyu@amazon.com>
Date:      2024-10-04 15:10:29 -07:00
Committer: Paolo Abeni
5 files changed, 104 insertions(+), 0 deletions(-)

@@ -92,6 +92,27 @@ static inline bool lockdep_rtnl_is_held(void)
 #define rcu_replace_pointer_rtnl(rp, p) \
 	rcu_replace_pointer(rp, p, lockdep_rtnl_is_held())
 
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net);
+void __rtnl_net_unlock(struct net *net);
+void rtnl_net_lock(struct net *net);
+void rtnl_net_unlock(struct net *net);
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b);
+#else
+static inline void __rtnl_net_lock(struct net *net) {}
+static inline void __rtnl_net_unlock(struct net *net) {}
+
+static inline void rtnl_net_lock(struct net *net)
+{
+	rtnl_lock();
+}
+
+static inline void rtnl_net_unlock(struct net *net)
+{
+	rtnl_unlock();
+}
+#endif
+
 static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
 {
 	return rtnl_dereference(dev->ingress_queue);

@@ -188,6 +188,10 @@ struct net {
 #if IS_ENABLED(CONFIG_SMC)
 	struct netns_smc	smc;
 #endif
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+	/* Move to a better place when the config guard is removed. */
+	struct mutex		rtnl_mutex;
+#endif
 } __randomize_layout;
 
 #include <linux/seq_file_net.h>

@@ -24,3 +24,18 @@ config DEBUG_NET
 	help
 	  Enable extra sanity checks in networking.
 	  This is mostly used by fuzzers, but is safe to select.
+
+config DEBUG_NET_SMALL_RTNL
+	bool "Add extra per-netns mutex inside RTNL"
+	depends on DEBUG_KERNEL && NET && LOCK_DEBUGGING_SUPPORT
+	select PROVE_LOCKING
+	default n
+	help
+	  rtnl_lock() is being replaced with rtnl_net_lock() that
+	  acquires the global RTNL and a small per-netns RTNL mutex.
+
+	  During the conversion, rtnl_net_lock() just adds an extra
+	  mutex in every RTNL scope and slows down the operations.
+
+	  Once the conversion completes, rtnl_lock() will be removed
+	  and rtnetlink will gain per-netns scalability.

@@ -334,6 +334,12 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
 	idr_init(&net->netns_ids);
 	spin_lock_init(&net->nsid_lock);
 	mutex_init(&net->ipv4.ra_mutex);
+
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+	mutex_init(&net->rtnl_mutex);
+	lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
+#endif
+
 	preinit_net_sysctl(net);
 }

@@ -179,6 +179,64 @@ bool lockdep_rtnl_is_held(void)
 EXPORT_SYMBOL(lockdep_rtnl_is_held);
 #endif /* #ifdef CONFIG_PROVE_LOCKING */
 
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net)
+{
+	ASSERT_RTNL();
+
+	mutex_lock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_lock);
+
+void __rtnl_net_unlock(struct net *net)
+{
+	ASSERT_RTNL();
+
+	mutex_unlock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_unlock);
+
+void rtnl_net_lock(struct net *net)
+{
+	rtnl_lock();
+	__rtnl_net_lock(net);
+}
+EXPORT_SYMBOL(rtnl_net_lock);
+
+void rtnl_net_unlock(struct net *net)
+{
+	__rtnl_net_unlock(net);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL(rtnl_net_unlock);
+
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+	if (net_eq(net_a, net_b))
+		return 0;
+
+	/* always init_net first */
+	if (net_eq(net_a, &init_net))
+		return -1;
+
+	if (net_eq(net_b, &init_net))
+		return 1;
+
+	/* otherwise lock in ascending order */
+	return net_a < net_b ? -1 : 1;
+}
+
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b)
+{
+	const struct net *net_a, *net_b;
+
+	net_a = container_of(a, struct net, rtnl_mutex.dep_map);
+	net_b = container_of(b, struct net, rtnl_mutex.dep_map);
+
+	return rtnl_net_cmp_locks(net_a, net_b);
+}
+#endif
+
 static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
 
 static inline int rtm_msgindex(int msgtype)