mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-16 02:14:58 +00:00
cb358ff941
syzbot presented an use-after-free report [0] regarding ipvlan and linkwatch. ipvlan does not hold a refcnt of the lower device unlike vlan and macvlan. If the linkwatch work is triggered for the ipvlan dev, the lower dev might have already been freed, resulting in UAF of ipvlan->phy_dev in ipvlan_get_iflink(). We can delay the lower dev unregistration like vlan and macvlan by holding the lower dev's refcnt in dev->netdev_ops->ndo_init() and releasing it in dev->priv_destructor(). Jakub pointed out calling .ndo_XXX after unregister_netdevice() has returned is error prone and suggested [1] addressing this UAF in the core by taking commit 750e51603395 ("net: avoid potential UAF in default_operstate()") further. Let's assume unregistering devices DOWN and use RCU protection in default_operstate() not to race with the device unregistration. [0]: BUG: KASAN: slab-use-after-free in ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 Read of size 4 at addr ffff0000d768c0e0 by task kworker/u8:35/6944 CPU: 0 UID: 0 PID: 6944 Comm: kworker/u8:35 Not tainted 6.13.0-rc2-g9bc5c9515b48 #12 4c3cb9e8b4565456f6a355f312ff91f4f29b3c47 Hardware name: linux,dummy-virt (DT) Workqueue: events_unbound linkwatch_event Call trace: show_stack+0x38/0x50 arch/arm64/kernel/stacktrace.c:484 (C) __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xbc/0x108 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x16c/0x6f0 mm/kasan/report.c:489 kasan_report+0xc0/0x120 mm/kasan/report.c:602 __asan_report_load4_noabort+0x20/0x30 mm/kasan/report_generic.c:380 ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 dev_get_iflink+0x7c/0xd8 net/core/dev.c:674 default_operstate net/core/link_watch.c:45 [inline] rfc2863_policy+0x144/0x360 net/core/link_watch.c:72 linkwatch_do_dev+0x60/0x228 net/core/link_watch.c:175 __linkwatch_run_queue+0x2f4/0x5b8 net/core/link_watch.c:239 linkwatch_event+0x64/0xa8 net/core/link_watch.c:282 
process_one_work+0x700/0x1398 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x8c4/0xe10 kernel/workqueue.c:3391 kthread+0x2b0/0x360 kernel/kthread.c:389 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:862 Allocated by task 9303: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_alloc_info+0x44/0x58 mm/kasan/generic.c:568 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x84/0xa0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4283 [inline] __kmalloc_node_noprof+0x2a0/0x560 mm/slub.c:4289 __kvmalloc_node_noprof+0x9c/0x230 mm/util.c:650 alloc_netdev_mqs+0xb4/0x1118 net/core/dev.c:11209 rtnl_create_link+0x2b8/0xb60 net/core/rtnetlink.c:3595 rtnl_newlink_create+0x19c/0x868 net/core/rtnetlink.c:3771 __rtnl_newlink net/core/rtnetlink.c:3896 [inline] rtnl_newlink+0x122c/0x15c0 net/core/rtnetlink.c:4011 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] __sys_sendto+0x2ec/0x438 net/socket.c:2197 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __arm64_sys_sendto+0xe4/0x110 net/socket.c:2200 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 
Freed by task 10200: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_free_info+0x58/0x70 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x48/0x68 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2338 [inline] slab_free mm/slub.c:4598 [inline] kfree+0x140/0x420 mm/slub.c:4746 kvfree+0x4c/0x68 mm/util.c:693 netdev_release+0x94/0xc8 net/core/net-sysfs.c:2034 device_release+0x98/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x2b0/0x438 lib/kobject.c:737 netdev_run_todo+0xdd8/0xf48 net/core/dev.c:10924 rtnl_unlock net/core/rtnetlink.c:152 [inline] rtnl_net_unlock net/core/rtnetlink.c:209 [inline] rtnl_dellink+0x484/0x680 net/core/rtnetlink.c:3526 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] ____sys_sendmsg+0x410/0x708 net/socket.c:2583 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2672 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 
arch/arm64/kernel/entry.S:600 The buggy address belongs to the object at ffff0000d768c000 which belongs to the cache kmalloc-cg-4k of size 4096 The buggy address is located 224 bytes inside of freed 4096-byte region [ffff0000d768c000, ffff0000d768d000) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x117688 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff0000c77ef981 flags: 0xbfffe0000000040(head|node=0|zone=2|lastcpupid=0x1ffff) page_type: f5(slab) raw: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 raw: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 head: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000003 fffffdffc35da201 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff0000d768bf80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff0000d768c000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff0000d768c080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff0000d768c100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff0000d768c180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 8c55facecd7a ("net: linkwatch: only report IF_OPER_LOWERLAYERDOWN if iflink is actually down") Reported-by: syzkaller <syzkaller@googlegroups.com> Suggested-by: Jakub Kicinski <kuba@kernel.org> Link: https://lore.kernel.org/netdev/20250102174400.085fd8ac@kernel.org/ [1] Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Link: https://patch.msgid.link/20250106071911.64355-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
303 lines
6.9 KiB
C
303 lines
6.9 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* Linux network device link state notification
|
|
*
|
|
* Author:
|
|
* Stefan Rompf <sux@loplof.de>
|
|
*/
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/if.h>
|
|
#include <net/sock.h>
|
|
#include <net/pkt_sched.h>
|
|
#include <linux/rtnetlink.h>
|
|
#include <linux/jiffies.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/bitops.h>
|
|
#include <linux/types.h>
|
|
|
|
#include "dev.h"
|
|
|
|
/* Flags for the deferred linkwatch work, manipulated with atomic bitops. */
enum lw_bits {
	LW_URGENT = 0,	/* an urgent event asked for the work to run ASAP */
};

/* Shared scheduler state: linkwatch_flags holds the LW_* bits (atomic
 * bitops only); linkwatch_nextevent is the earliest jiffies value at
 * which a non-urgent run may happen (the 1 Hz rate limit).
 */
static unsigned long linkwatch_flags;
static unsigned long linkwatch_nextevent;

static void linkwatch_event(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);

/* Devices with a pending link event; entries hold a netdev ref and the
 * list is protected by lweventlist_lock (taken with IRQs disabled).
 */
static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);
|
|
|
|
static unsigned int default_operstate(const struct net_device *dev)
|
|
{
|
|
if (netif_testing(dev))
|
|
return IF_OPER_TESTING;
|
|
|
|
/* Some uppers (DSA) have additional sources for being down, so
|
|
* first check whether lower is indeed the source of its down state.
|
|
*/
|
|
if (!netif_carrier_ok(dev)) {
|
|
struct net_device *peer;
|
|
int iflink;
|
|
|
|
/* If called from netdev_run_todo()/linkwatch_sync_dev(),
|
|
* dev_net(dev) can be already freed, and RTNL is not held.
|
|
*/
|
|
if (dev->reg_state <= NETREG_REGISTERED)
|
|
iflink = dev_get_iflink(dev);
|
|
else
|
|
iflink = dev->ifindex;
|
|
|
|
if (iflink == dev->ifindex)
|
|
return IF_OPER_DOWN;
|
|
|
|
ASSERT_RTNL();
|
|
peer = __dev_get_by_index(dev_net(dev), iflink);
|
|
if (!peer)
|
|
return IF_OPER_DOWN;
|
|
|
|
return netif_carrier_ok(peer) ? IF_OPER_DOWN :
|
|
IF_OPER_LOWERLAYERDOWN;
|
|
}
|
|
|
|
if (netif_dormant(dev))
|
|
return IF_OPER_DORMANT;
|
|
|
|
return IF_OPER_UP;
|
|
}
|
|
|
|
static void rfc2863_policy(struct net_device *dev)
|
|
{
|
|
unsigned int operstate = default_operstate(dev);
|
|
|
|
if (operstate == READ_ONCE(dev->operstate))
|
|
return;
|
|
|
|
switch(dev->link_mode) {
|
|
case IF_LINK_MODE_TESTING:
|
|
if (operstate == IF_OPER_UP)
|
|
operstate = IF_OPER_TESTING;
|
|
break;
|
|
|
|
case IF_LINK_MODE_DORMANT:
|
|
if (operstate == IF_OPER_UP)
|
|
operstate = IF_OPER_DORMANT;
|
|
break;
|
|
case IF_LINK_MODE_DEFAULT:
|
|
default:
|
|
break;
|
|
}
|
|
|
|
WRITE_ONCE(dev->operstate, operstate);
|
|
}
|
|
|
|
|
|
void linkwatch_init_dev(struct net_device *dev)
|
|
{
|
|
/* Handle pre-registration link state changes */
|
|
if (!netif_carrier_ok(dev) || netif_dormant(dev) ||
|
|
netif_testing(dev))
|
|
rfc2863_policy(dev);
|
|
}
|
|
|
|
|
|
static bool linkwatch_urgent_event(struct net_device *dev)
|
|
{
|
|
if (!netif_running(dev))
|
|
return false;
|
|
|
|
if (dev->ifindex != dev_get_iflink(dev))
|
|
return true;
|
|
|
|
if (netif_is_lag_port(dev) || netif_is_lag_master(dev))
|
|
return true;
|
|
|
|
return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
|
|
}
|
|
|
|
|
|
static void linkwatch_add_event(struct net_device *dev)
|
|
{
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&lweventlist_lock, flags);
|
|
if (list_empty(&dev->link_watch_list)) {
|
|
list_add_tail(&dev->link_watch_list, &lweventlist);
|
|
netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
|
|
}
|
|
spin_unlock_irqrestore(&lweventlist_lock, flags);
|
|
}
|
|
|
|
|
|
/* Arrange for linkwatch_event() to run: immediately for urgent events,
 * otherwise no earlier than linkwatch_nextevent (the 1 Hz rate limit).
 * Safe against concurrent callers via the LW_URGENT atomic bit.
 */
static void linkwatch_schedule_work(int urgent)
{
	unsigned long delay = linkwatch_nextevent - jiffies;

	/* An urgent run is already scheduled; nothing can make it sooner. */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		return;

	/* Minimise down-time: drop delay for up event. */
	if (urgent) {
		/* Another CPU set the bit first and scheduled for us. */
		if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
			return;
		delay = 0;
	}

	/* If we wrap around we'll delay it by at most HZ. */
	if (delay > HZ)
		delay = 0;

	/*
	 * If urgent, schedule immediate execution; otherwise, don't
	 * override the existing timer.
	 */
	if (test_bit(LW_URGENT, &linkwatch_flags))
		mod_delayed_work(system_unbound_wq, &linkwatch_work, 0);
	else
		queue_delayed_work(system_unbound_wq, &linkwatch_work, delay);
}
|
|
|
|
|
|
/* Process one device's pending link event: refresh its operstate, then
 * (de)activate its qdisc and notify userspace if the device is up.
 * Consumes the reference taken by linkwatch_add_event(); the caller has
 * already freed the ref tracker, hence __dev_put() below.
 */
static void linkwatch_do_dev(struct net_device *dev)
{
	/*
	 * Make sure the above read is complete since it can be
	 * rewritten as soon as we clear the bit below.
	 */
	smp_mb__before_atomic();

	/* We are about to handle this device,
	 * so new events can be accepted
	 */
	clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);

	rfc2863_policy(dev);
	if (dev->flags & IFF_UP) {
		/* Carrier decides whether the qdisc may transmit. */
		if (netif_carrier_ok(dev))
			dev_activate(dev);
		else
			dev_deactivate(dev);

		netdev_state_change(dev);
	}
	/* Note: our callers are responsible for calling netdev_tracker_free().
	 * This is the reason we use __dev_put() instead of dev_put().
	 */
	__dev_put(dev);
}
|
|
|
|
/* Drain the pending-event list (up to a per-invocation budget).  With
 * urgent_only set, non-urgent devices are pushed back onto the global
 * list to wait for the next rate-limited run.  Called under RTNL.
 */
static void __linkwatch_run_queue(int urgent_only)
{
#define MAX_DO_DEV_PER_LOOP 100

	int do_dev = MAX_DO_DEV_PER_LOOP;
	/* Use a local list here since we add non-urgent
	 * events back to the global one when called with
	 * urgent_only=1.
	 */
	LIST_HEAD(wrk);

	/* Give urgent case more budget */
	if (urgent_only)
		do_dev += MAX_DO_DEV_PER_LOOP;

	/*
	 * Limit the number of linkwatch events to one
	 * per second so that a runaway driver does not
	 * cause a storm of messages on the netlink
	 * socket. This limit does not apply to up events
	 * while the device qdisc is down.
	 */
	if (!urgent_only)
		linkwatch_nextevent = jiffies + HZ;
	/* Limit wrap-around effect on delay. */
	else if (time_after(linkwatch_nextevent, jiffies + HZ))
		linkwatch_nextevent = jiffies;

	/* Clear after updating nextevent so a racing urgent
	 * linkwatch_fire_event() can re-arm the work.
	 */
	clear_bit(LW_URGENT, &linkwatch_flags);

	spin_lock_irq(&lweventlist_lock);
	list_splice_init(&lweventlist, &wrk);

	while (!list_empty(&wrk) && do_dev > 0) {
		struct net_device *dev;

		dev = list_first_entry(&wrk, struct net_device, link_watch_list);
		list_del_init(&dev->link_watch_list);

		/* Absent devices and (in urgent mode) non-urgent ones are
		 * re-queued for a later full run; they don't eat budget.
		 */
		if (!netif_device_present(dev) ||
		    (urgent_only && !linkwatch_urgent_event(dev))) {
			list_add_tail(&dev->link_watch_list, &lweventlist);
			continue;
		}
		/* We must free netdev tracker under
		 * the spinlock protection.
		 */
		netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
		/* Drop the lock while handling the device; it may sleep
		 * paths into qdisc/notifier code via linkwatch_do_dev().
		 */
		spin_unlock_irq(&lweventlist_lock);
		linkwatch_do_dev(dev);
		do_dev--;
		spin_lock_irq(&lweventlist_lock);
	}

	/* Add the remaining work back to lweventlist */
	list_splice_init(&wrk, &lweventlist);

	/* Anything left over (budget exhausted or deferred non-urgent
	 * events) gets a new, rate-limited run.
	 */
	if (!list_empty(&lweventlist))
		linkwatch_schedule_work(0);
	spin_unlock_irq(&lweventlist_lock);
}
|
|
|
|
void linkwatch_sync_dev(struct net_device *dev)
|
|
{
|
|
unsigned long flags;
|
|
int clean = 0;
|
|
|
|
spin_lock_irqsave(&lweventlist_lock, flags);
|
|
if (!list_empty(&dev->link_watch_list)) {
|
|
list_del_init(&dev->link_watch_list);
|
|
clean = 1;
|
|
/* We must release netdev tracker under
|
|
* the spinlock protection.
|
|
*/
|
|
netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
|
|
}
|
|
spin_unlock_irqrestore(&lweventlist_lock, flags);
|
|
if (clean)
|
|
linkwatch_do_dev(dev);
|
|
}
|
|
|
|
|
|
/* Must be called with the rtnl semaphore held */
void linkwatch_run_queue(void)
{
	/* urgent_only=0: process every pending event, urgent or not. */
	__linkwatch_run_queue(0);
}
|
|
|
|
|
|
/* Delayed-work handler: drain the event list under RTNL.  If we were
 * woken before the rate-limit window (linkwatch_nextevent) expired,
 * this was an urgent wakeup, so process only urgent events.
 */
static void linkwatch_event(struct work_struct *dummy)
{
	rtnl_lock();
	__linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
	rtnl_unlock();
}
|
|
|
|
|
|
void linkwatch_fire_event(struct net_device *dev)
|
|
{
|
|
bool urgent = linkwatch_urgent_event(dev);
|
|
|
|
if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
|
|
linkwatch_add_event(dev);
|
|
} else if (!urgent)
|
|
return;
|
|
|
|
linkwatch_schedule_work(urgent);
|
|
}
|
|
EXPORT_SYMBOL(linkwatch_fire_event);
|