linux-stable/net/devlink/leftover.c

10841 lines
281 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/devlink.c - Network physical/parent device Netlink interface
*
* Heavily inspired by net/wireless/
* Copyright (c) 2016 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
*/
#include <linux/etherdevice.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/gfp.h>
#include <linux/device.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
#include <linux/u64_stats_sync.h>
#include <linux/timekeeping.h>
#include <rdma/ib_verbs.h>
#include <net/netlink.h>
#include <net/genetlink.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/devlink.h>
#define CREATE_TRACE_POINTS
#include <trace/events/devlink.h>
#include "devl_internal.h"
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
struct devlink_linecard {
struct list_head list;
struct devlink *devlink;
unsigned int index;
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
const struct devlink_linecard_ops *ops;
void *priv;
enum devlink_linecard_state state;
struct mutex state_lock; /* Protects state */
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
const char *type;
struct devlink_linecard_type *types;
unsigned int types_count;
struct devlink *nested_devlink;
};
/**
* struct devlink_resource - devlink resource
* @name: name of the resource
* @id: id, per devlink instance
* @size: size of the resource
* @size_new: updated size of the resource, reload is needed
* @size_valid: valid in case the total size of the resource is valid
* including its children
* @parent: parent resource
* @size_params: size parameters
* @list: parent list
* @resource_list: list of child resources
* @occ_get: occupancy getter callback
* @occ_get_priv: occupancy getter callback priv
*/
struct devlink_resource {
const char *name;
u64 id;
u64 size;
u64 size_new;
bool size_valid;
struct devlink_resource *parent;
struct devlink_resource_size_params size_params;
struct list_head list;
struct list_head resource_list;
devlink_resource_occ_get_t *occ_get;
void *occ_get_priv;
};
static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = {
{
.name = "destination mac",
.id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
.bitwidth = 48,
},
};
struct devlink_dpipe_header devlink_dpipe_header_ethernet = {
.name = "ethernet",
.id = DEVLINK_DPIPE_HEADER_ETHERNET,
.fields = devlink_dpipe_fields_ethernet,
.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet),
.global = true,
};
EXPORT_SYMBOL_GPL(devlink_dpipe_header_ethernet);
static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = {
{
.name = "destination ip",
.id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
.bitwidth = 32,
},
};
struct devlink_dpipe_header devlink_dpipe_header_ipv4 = {
.name = "ipv4",
.id = DEVLINK_DPIPE_HEADER_IPV4,
.fields = devlink_dpipe_fields_ipv4,
.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4),
.global = true,
};
EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv4);
static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = {
{
.name = "destination ip",
.id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
.bitwidth = 128,
},
};
struct devlink_dpipe_header devlink_dpipe_header_ipv6 = {
.name = "ipv6",
.id = DEVLINK_DPIPE_HEADER_IPV6,
.fields = devlink_dpipe_fields_ipv6,
.fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6),
.global = true,
};
EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv6);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
#define DEVLINK_PORT_FN_CAPS_VALID_MASK \
(_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1)
static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
[DEVLINK_PORT_FN_ATTR_STATE] =
NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
DEVLINK_PORT_FN_STATE_ACTIVE),
[DEVLINK_PORT_FN_ATTR_CAPS] =
NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
};
#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \
WARN_ON_ONCE(!(devlink_port)->registered)
#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \
WARN_ON_ONCE((devlink_port)->registered)
#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \
WARN_ON_ONCE(!(devlink_port)->initialized)
static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
unsigned int port_index)
{
return xa_load(&devlink->ports, port_index);
}
static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
struct nlattr **attrs)
{
if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
struct devlink_port *devlink_port;
devlink_port = devlink_port_get_by_index(devlink, port_index);
if (!devlink_port)
return ERR_PTR(-ENODEV);
return devlink_port;
}
return ERR_PTR(-EINVAL);
}
struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
return devlink_port_get_from_attrs(devlink, info->attrs);
}
static inline bool
devlink_rate_is_leaf(struct devlink_rate *devlink_rate)
{
return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF;
}
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
static inline bool
devlink_rate_is_node(struct devlink_rate *devlink_rate)
{
return devlink_rate->type == DEVLINK_RATE_TYPE_NODE;
}
static struct devlink_rate *
devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
{
struct devlink_rate *devlink_rate;
struct devlink_port *devlink_port;
devlink_port = devlink_port_get_from_attrs(devlink, info->attrs);
if (IS_ERR(devlink_port))
return ERR_CAST(devlink_port);
devlink_rate = devlink_port->devlink_rate;
return devlink_rate ?: ERR_PTR(-ENODEV);
}
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
static struct devlink_rate *
devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name)
{
static struct devlink_rate *devlink_rate;
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
if (devlink_rate_is_node(devlink_rate) &&
!strcmp(node_name, devlink_rate->name))
return devlink_rate;
}
return ERR_PTR(-ENODEV);
}
static struct devlink_rate *
devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
{
const char *rate_node_name;
size_t len;
if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME])
return ERR_PTR(-EINVAL);
rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]);
len = strlen(rate_node_name);
/* Name cannot be empty or decimal number */
if (!len || strspn(rate_node_name, "0123456789") == len)
return ERR_PTR(-EINVAL);
return devlink_rate_node_get_by_name(devlink, rate_node_name);
}
struct devlink_rate *
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info)
{
return devlink_rate_node_get_from_attrs(devlink, info->attrs);
}
struct devlink_rate *
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
{
struct nlattr **attrs = info->attrs;
if (attrs[DEVLINK_ATTR_PORT_INDEX])
return devlink_rate_leaf_get_from_info(devlink, info);
else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME])
return devlink_rate_node_get_from_info(devlink, info);
else
return ERR_PTR(-EINVAL);
}
static struct devlink_linecard *
devlink_linecard_get_by_index(struct devlink *devlink,
unsigned int linecard_index)
{
struct devlink_linecard *devlink_linecard;
list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) {
if (devlink_linecard->index == linecard_index)
return devlink_linecard;
}
return NULL;
}
static bool devlink_linecard_index_exists(struct devlink *devlink,
unsigned int linecard_index)
{
return devlink_linecard_get_by_index(devlink, linecard_index);
}
static struct devlink_linecard *
devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
{
if (attrs[DEVLINK_ATTR_LINECARD_INDEX]) {
u32 linecard_index = nla_get_u32(attrs[DEVLINK_ATTR_LINECARD_INDEX]);
struct devlink_linecard *linecard;
linecard = devlink_linecard_get_by_index(devlink, linecard_index);
if (!linecard)
return ERR_PTR(-ENODEV);
return linecard;
}
return ERR_PTR(-EINVAL);
}
struct devlink_linecard *
devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info)
{
return devlink_linecard_get_from_attrs(devlink, info->attrs);
}
struct devlink_sb {
struct list_head list;
unsigned int index;
u32 size;
u16 ingress_pools_count;
u16 egress_pools_count;
u16 ingress_tc_count;
u16 egress_tc_count;
};
static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb)
{
return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count;
}
static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink,
unsigned int sb_index)
{
struct devlink_sb *devlink_sb;
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
if (devlink_sb->index == sb_index)
return devlink_sb;
}
return NULL;
}
static bool devlink_sb_index_exists(struct devlink *devlink,
unsigned int sb_index)
{
return devlink_sb_get_by_index(devlink, sb_index);
}
static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink,
struct nlattr **attrs)
{
if (attrs[DEVLINK_ATTR_SB_INDEX]) {
u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]);
struct devlink_sb *devlink_sb;
devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
if (!devlink_sb)
return ERR_PTR(-ENODEV);
return devlink_sb;
}
return ERR_PTR(-EINVAL);
}
static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
return devlink_sb_get_from_attrs(devlink, info->attrs);
}
static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb,
struct nlattr **attrs,
u16 *p_pool_index)
{
u16 val;
if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX])
return -EINVAL;
val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]);
if (val >= devlink_sb_pool_count(devlink_sb))
return -EINVAL;
*p_pool_index = val;
return 0;
}
static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb,
struct genl_info *info,
u16 *p_pool_index)
{
return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs,
p_pool_index);
}
static int
devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs,
enum devlink_sb_pool_type *p_pool_type)
{
u8 val;
if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE])
return -EINVAL;
val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]);
if (val != DEVLINK_SB_POOL_TYPE_INGRESS &&
val != DEVLINK_SB_POOL_TYPE_EGRESS)
return -EINVAL;
*p_pool_type = val;
return 0;
}
static int
devlink_sb_pool_type_get_from_info(struct genl_info *info,
enum devlink_sb_pool_type *p_pool_type)
{
return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type);
}
static int
devlink_sb_th_type_get_from_attrs(struct nlattr **attrs,
enum devlink_sb_threshold_type *p_th_type)
{
u8 val;
if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE])
return -EINVAL;
val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]);
if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC &&
val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC)
return -EINVAL;
*p_th_type = val;
return 0;
}
static int
devlink_sb_th_type_get_from_info(struct genl_info *info,
enum devlink_sb_threshold_type *p_th_type)
{
return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type);
}
static int
devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
struct nlattr **attrs,
enum devlink_sb_pool_type pool_type,
u16 *p_tc_index)
{
u16 val;
if (!attrs[DEVLINK_ATTR_SB_TC_INDEX])
return -EINVAL;
val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]);
if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS &&
val >= devlink_sb->ingress_tc_count)
return -EINVAL;
if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS &&
val >= devlink_sb->egress_tc_count)
return -EINVAL;
*p_tc_index = val;
return 0;
}
static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps,
u32 cap, bool is_enable)
{
caps->selector |= cap;
if (is_enable)
caps->value |= cap;
}
static int devlink_port_fn_roce_fill(const struct devlink_ops *ops,
struct devlink_port *devlink_port,
struct nla_bitfield32 *caps,
struct netlink_ext_ack *extack)
{
bool is_enable;
int err;
if (!ops->port_fn_roce_get)
return 0;
err = ops->port_fn_roce_get(devlink_port, &is_enable, extack);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
return err;
}
devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable);
return 0;
}
devlink: Expose port function commands to control migratable Expose port function commands to enable / disable migratable capability, this is used to set the port function as migratable. Live migration is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration. e.g.: to be migratable. In order for VF to be migratable, VF must be bound to VFIO driver with migration support. When migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Acked-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-12-06 20:51:18 +02:00
static int devlink_port_fn_migratable_fill(const struct devlink_ops *ops,
struct devlink_port *devlink_port,
struct nla_bitfield32 *caps,
struct netlink_ext_ack *extack)
{
bool is_enable;
int err;
if (!ops->port_fn_migratable_get ||
devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
return 0;
err = ops->port_fn_migratable_get(devlink_port, &is_enable, extack);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
return err;
}
devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable);
return 0;
}
static int devlink_port_fn_caps_fill(const struct devlink_ops *ops,
struct devlink_port *devlink_port,
struct sk_buff *msg,
struct netlink_ext_ack *extack,
bool *msg_updated)
{
struct nla_bitfield32 caps = {};
int err;
err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack);
if (err)
return err;
devlink: Expose port function commands to control migratable Expose port function commands to enable / disable migratable capability, this is used to set the port function as migratable. Live migration is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration. e.g.: to be migratable. In order for VF to be migratable, VF must be bound to VFIO driver with migration support. When migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Acked-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-12-06 20:51:18 +02:00
err = devlink_port_fn_migratable_fill(ops, devlink_port, &caps, extack);
if (err)
return err;
if (!caps.selector)
return 0;
err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value,
caps.selector);
if (err)
return err;
*msg_updated = true;
return 0;
}
static int
devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
struct genl_info *info,
enum devlink_sb_pool_type pool_type,
u16 *p_tc_index)
{
return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs,
pool_type, p_tc_index);
}
struct devlink_region {
struct devlink *devlink;
struct devlink_port *port;
struct list_head list;
union {
const struct devlink_region_ops *ops;
const struct devlink_port_region_ops *port_ops;
};
struct mutex snapshot_lock; /* protects snapshot_list,
* max_snapshots and cur_snapshots
* consistency.
*/
struct list_head snapshot_list;
u32 max_snapshots;
u32 cur_snapshots;
u64 size;
};
struct devlink_snapshot {
struct list_head list;
struct devlink_region *region;
u8 *data;
u32 id;
};
static struct devlink_region *
devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
{
struct devlink_region *region;
list_for_each_entry(region, &devlink->region_list, list)
if (!strcmp(region->ops->name, region_name))
return region;
return NULL;
}
static struct devlink_region *
devlink_port_region_get_by_name(struct devlink_port *port,
const char *region_name)
{
struct devlink_region *region;
list_for_each_entry(region, &port->region_list, list)
if (!strcmp(region->ops->name, region_name))
return region;
return NULL;
}
static struct devlink_snapshot *
devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
{
struct devlink_snapshot *snapshot;
list_for_each_entry(snapshot, &region->snapshot_list, list)
if (snapshot->id == id)
return snapshot;
return NULL;
}
static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink)
{
struct nlattr *nested_attr;
nested_attr = nla_nest_start(msg, DEVLINK_ATTR_NESTED_DEVLINK);
if (!nested_attr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
nla_nest_end(msg, nested_attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, nested_attr);
return -EMSGSIZE;
}
int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port)
{
if (devlink_nl_put_handle(msg, devlink_port->devlink))
return -EMSGSIZE;
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
return -EMSGSIZE;
return 0;
}
size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port)
{
struct devlink *devlink = devlink_port->devlink;
return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */
+ nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */
+ nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */
}
static int devlink_nl_port_attrs_put(struct sk_buff *msg,
struct devlink_port *devlink_port)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
if (!devlink_port->attrs_set)
return 0;
if (attrs->lanes) {
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes))
return -EMSGSIZE;
}
if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable))
return -EMSGSIZE;
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
return -EMSGSIZE;
switch (devlink_port->attrs.flavour) {
case DEVLINK_PORT_FLAVOUR_PCI_PF:
devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 07:50:37 +03:00
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
attrs->pci_pf.controller) ||
nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf))
return -EMSGSIZE;
if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external))
return -EMSGSIZE;
break;
case DEVLINK_PORT_FLAVOUR_PCI_VF:
devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 07:50:37 +03:00
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
attrs->pci_vf.controller) ||
nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) ||
nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf))
return -EMSGSIZE;
if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external))
return -EMSGSIZE;
break;
devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:13 -08:00
case DEVLINK_PORT_FLAVOUR_PCI_SF:
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
attrs->pci_sf.controller) ||
nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
attrs->pci_sf.pf) ||
nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER,
attrs->pci_sf.sf))
return -EMSGSIZE;
break;
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
attrs->phys.port_number))
return -EMSGSIZE;
if (!attrs->split)
return 0;
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
attrs->phys.port_number))
return -EMSGSIZE;
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
attrs->phys.split_subport_number))
return -EMSGSIZE;
break;
default:
break;
}
return 0;
}
static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops,
struct devlink_port *port,
struct sk_buff *msg,
struct netlink_ext_ack *extack,
bool *msg_updated)
{
u8 hw_addr[MAX_ADDR_LEN];
int hw_addr_len;
int err;
if (!ops->port_function_hw_addr_get)
return 0;
err = ops->port_function_hw_addr_get(port, hw_addr, &hw_addr_len,
extack);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
return err;
}
err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr);
if (err)
return err;
*msg_updated = true;
return 0;
}
static int devlink_nl_rate_fill(struct sk_buff *msg,
struct devlink_rate *devlink_rate,
enum devlink_command cmd, u32 portid, u32 seq,
int flags, struct netlink_ext_ack *extack)
{
struct devlink *devlink = devlink_rate->devlink;
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type))
goto nla_put_failure;
if (devlink_rate_is_leaf(devlink_rate)) {
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
devlink_rate->devlink_port->index))
goto nla_put_failure;
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
} else if (devlink_rate_is_node(devlink_rate)) {
if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME,
devlink_rate->name))
goto nla_put_failure;
}
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE,
devlink_rate->tx_share, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX,
devlink_rate->tx_max, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY,
devlink_rate->tx_priority))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT,
devlink_rate->tx_weight))
goto nla_put_failure;
if (devlink_rate->parent)
if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
devlink_rate->parent->name))
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
static bool
devlink_port_fn_state_valid(enum devlink_port_fn_state state)
{
return state == DEVLINK_PORT_FN_STATE_INACTIVE ||
state == DEVLINK_PORT_FN_STATE_ACTIVE;
}
static bool
devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate)
{
return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED ||
opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED;
}
static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
struct devlink_port *port,
struct sk_buff *msg,
struct netlink_ext_ack *extack,
bool *msg_updated)
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
{
enum devlink_port_fn_opstate opstate;
enum devlink_port_fn_state state;
int err;
if (!ops->port_fn_state_get)
return 0;
err = ops->port_fn_state_get(port, &state, &opstate, extack);
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
if (err) {
if (err == -EOPNOTSUPP)
return 0;
return err;
}
if (!devlink_port_fn_state_valid(state)) {
WARN_ON_ONCE(1);
NL_SET_ERR_MSG_MOD(extack, "Invalid state read from driver");
return -EINVAL;
}
if (!devlink_port_fn_opstate_valid(opstate)) {
WARN_ON_ONCE(1);
NL_SET_ERR_MSG_MOD(extack,
"Invalid operational state read from driver");
return -EINVAL;
}
if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) ||
nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate))
return -EMSGSIZE;
*msg_updated = true;
return 0;
}
devlink: Expose port function commands to control migratable Expose port function commands to enable / disable migratable capability, this is used to set the port function as migratable. Live migration is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration. e.g.: to be migratable. In order for VF to be migratable, VF must be bound to VFIO driver with migration support. When migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Acked-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-12-06 20:51:18 +02:00
static int
devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink_port->devlink->ops;
return ops->port_fn_migratable_set(devlink_port, enable, extack);
}
static int
devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink_port->devlink->ops;
return ops->port_fn_roce_set(devlink_port, enable, extack);
}
static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
struct nla_bitfield32 caps;
u32 caps_value;
int err;
caps = nla_get_bitfield32(attr);
caps_value = caps.value & caps.selector;
if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) {
err = devlink_port_fn_roce_set(devlink_port,
caps_value & DEVLINK_PORT_FN_CAP_ROCE,
extack);
if (err)
return err;
}
devlink: Expose port function commands to control migratable Expose port function commands to enable / disable migratable capability, this is used to set the port function as migratable. Live migration is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration. e.g.: to be migratable. In order for VF to be migratable, VF must be bound to VFIO driver with migration support. When migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Acked-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-12-06 20:51:18 +02:00
if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
err = devlink_port_fn_mig_set(devlink_port, caps_value &
DEVLINK_PORT_FN_CAP_MIGRATABLE,
extack);
if (err)
return err;
}
return 0;
}
static int
devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops;
struct nlattr *function_attr;
bool msg_updated = false;
int err;
function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION);
if (!function_attr)
return -EMSGSIZE;
ops = port->devlink->ops;
err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack,
&msg_updated);
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
if (err)
goto out;
err = devlink_port_fn_caps_fill(ops, port, msg, extack,
&msg_updated);
if (err)
goto out;
err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
out:
if (err || !msg_updated)
nla_nest_cancel(msg, function_attr);
else
nla_nest_end(msg, function_attr);
return err;
}
static int devlink_nl_port_fill(struct sk_buff *msg,
struct devlink_port *devlink_port,
enum devlink_command cmd, u32 portid, u32 seq,
int flags, struct netlink_ext_ack *extack)
{
struct devlink *devlink = devlink_port->devlink;
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
goto nla_put_failure;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
spin_lock_bh(&devlink_port->type_lock);
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
goto nla_put_failure_type_locked;
if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
devlink_port->desired_type))
goto nla_put_failure_type_locked;
if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
if (devlink_port->type_eth.netdev &&
(nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
devlink_port->type_eth.ifindex) ||
nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
devlink_port->type_eth.ifname)))
goto nla_put_failure_type_locked;
}
if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
struct ib_device *ibdev = devlink_port->type_ib.ibdev;
if (ibdev &&
nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
ibdev->name))
goto nla_put_failure_type_locked;
}
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
spin_unlock_bh(&devlink_port->type_lock);
if (devlink_nl_port_attrs_put(msg, devlink_port))
goto nla_put_failure;
if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack))
goto nla_put_failure;
if (devlink_port->linecard &&
nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX,
devlink_port->linecard->index))
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure_type_locked:
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
spin_unlock_bh(&devlink_port->type_lock);
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static void devlink_port_notify(struct devlink_port *devlink_port,
enum devlink_command cmd)
{
struct devlink *devlink = devlink_port->devlink;
struct sk_buff *msg;
int err;
WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
return;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg,
0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static void devlink_rate_notify(struct devlink_rate *devlink_rate,
enum devlink_command cmd)
{
struct devlink *devlink = devlink_rate->devlink;
struct sk_buff *msg;
int err;
WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL);
devlink: Add missed notifications iterators The commit mentioned in Fixes line missed a couple of notifications that were registered before devlink_register() and should be delayed too. As such, the too early placed WARN_ON() check spotted it. WARNING: CPU: 1 PID: 6540 at net/core/devlink.c:5158 devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Modules linked in: CPU: 1 PID: 6540 Comm: syz-executor.0 Not tainted 5.15.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Code: 38 41 b8 c0 0c 00 00 31 d2 48 89 ee 4c 89 e7 e8 72 1a 26 00 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e e9 01 bd 41 fa e8 fc bc 41 fa <0f> 0b e9 f7 fe ff ff e8 f0 bc 41 fa 0f 0b eb da 4c 89 e7 e8 c4 18 RSP: 0018:ffffc90002d6f658 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88801f08d580 RSI: ffffffff87344e94 RDI: 0000000000000003 RBP: ffff88801ee42100 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff87344d8a R11: 0000000000000000 R12: ffff88801c1dc000 R13: 0000000000000000 R14: 000000000000002c R15: ffff88801c1dc070 FS: 0000555555e8e400(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dd7c590310 CR3: 0000000069a09000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: devlink_region_create+0x39f/0x4c0 net/core/devlink.c:10327 nsim_dev_dummy_region_init drivers/net/netdevsim/dev.c:481 [inline] nsim_dev_probe+0x5f6/0x1150 drivers/net/netdevsim/dev.c:1479 call_driver_probe drivers/base/dd.c:517 [inline] really_probe+0x245/0xcc0 drivers/base/dd.c:596 __driver_probe_device+0x338/0x4d0 drivers/base/dd.c:751 driver_probe_device+0x4c/0x1a0 drivers/base/dd.c:781 __device_attach_driver+0x20b/0x2f0 drivers/base/dd.c:898 bus_for_each_drv+0x15f/0x1e0 drivers/base/bus.c:427 __device_attach+0x228/0x4a0 drivers/base/dd.c:969 bus_probe_device+0x1e4/0x290 drivers/base/bus.c:487 device_add+0xc35/0x21b0 drivers/base/core.c:3359 nsim_bus_dev_new drivers/net/netdevsim/bus.c:435 [inline] new_device_store+0x48b/0x770 drivers/net/netdevsim/bus.c:302 bus_attr_store+0x72/0xa0 drivers/base/bus.c:122 sysfs_kf_write+0x110/0x160 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x342/0x500 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2163 [inline] new_sync_write+0x429/0x660 fs/read_write.c:507 vfs_write+0x7cf/0xae0 fs/read_write.c:594 ksys_write+0x12d/0x250 fs/read_write.c:647 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f328409d3ef Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 99 fd ff ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 cc fd ff ff 48 RSP: 002b:00007ffdc6851140 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f328409d3ef RDX: 0000000000000003 RSI: 00007ffdc6851190 RDI: 0000000000000004 RBP: 0000000000000004 R08: 0000000000000000 R09: 00007ffdc68510e0 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f3284144971 R13: 00007ffdc6851190 R14: 0000000000000000 R15: 00007ffdc6851860 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Reported-by: Eric Dumazet <eric.dumazet@gmail.com> Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Link: https://lore.kernel.org/r/2ed1159291f2a589b013914f2b60d8172fc525c1.1632925030.git.leonro@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-09-29 17:18:20 +03:00
if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
return;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg,
0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static int
devlink_nl_cmd_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_rate *devlink_rate;
int idx = 0;
int err = 0;
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
u32 id = NETLINK_CB(cb->skb).portid;
if (idx < state->idx) {
idx++;
continue;
}
err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id,
cb->nlh->nlmsg_seq,
NLM_F_MULTI, NULL);
if (err) {
state->idx = idx;
break;
}
idx++;
}
return err;
}
const struct devlink_cmd devl_cmd_rate_get = {
.dump_one = devlink_nl_cmd_rate_get_dump_one,
};
static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_rate *devlink_rate = info->user_ptr[1];
struct sk_buff *msg;
int err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW,
info->snd_portid, info->snd_seq, 0,
info->extack);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static bool
devlink_rate_is_parent_node(struct devlink_rate *devlink_rate,
struct devlink_rate *parent)
{
while (parent) {
if (parent == devlink_rate)
return true;
parent = parent->parent;
}
return false;
}
static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
struct sk_buff *msg;
int err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW,
info->snd_portid, info->snd_seq, 0,
info->extack);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int
devlink_nl_cmd_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_port *devlink_port;
unsigned long port_index;
int idx = 0;
int err = 0;
xa_for_each(&devlink->ports, port_index, devlink_port) {
if (idx < state->idx) {
idx++;
continue;
}
err = devlink_nl_port_fill(msg, devlink_port,
DEVLINK_CMD_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI, cb->extack);
if (err) {
state->idx = idx;
break;
}
idx++;
}
return err;
}
const struct devlink_cmd devl_cmd_port_get = {
.dump_one = devlink_nl_cmd_port_get_dump_one,
};
static int devlink_port_type_set(struct devlink_port *devlink_port,
enum devlink_port_type port_type)
{
int err;
if (!devlink_port->devlink->ops->port_type_set)
return -EOPNOTSUPP;
if (port_type == devlink_port->type)
return 0;
err = devlink_port->devlink->ops->port_type_set(devlink_port,
port_type);
if (err)
return err;
devlink_port->desired_type = port_type;
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
return 0;
}
static int devlink_port_function_hw_addr_set(struct devlink_port *port,
const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = port->devlink->ops;
const u8 *hw_addr;
int hw_addr_len;
hw_addr = nla_data(attr);
hw_addr_len = nla_len(attr);
if (hw_addr_len > MAX_ADDR_LEN) {
NL_SET_ERR_MSG_MOD(extack, "Port function hardware address too long");
return -EINVAL;
}
if (port->type == DEVLINK_PORT_TYPE_ETH) {
if (hw_addr_len != ETH_ALEN) {
NL_SET_ERR_MSG_MOD(extack, "Address must be 6 bytes for Ethernet device");
return -EINVAL;
}
if (!is_unicast_ether_addr(hw_addr)) {
NL_SET_ERR_MSG_MOD(extack, "Non-unicast hardware address unsupported");
return -EINVAL;
}
}
return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len,
extack);
}
static int devlink_port_fn_state_set(struct devlink_port *port,
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
enum devlink_port_fn_state state;
const struct devlink_ops *ops;
state = nla_get_u8(attr);
ops = port->devlink->ops;
return ops->port_fn_state_set(port, state, extack);
}
static int devlink_port_function_validate(struct devlink_port *devlink_port,
struct nlattr **tb,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink_port->devlink->ops;
struct nlattr *attr;
if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] &&
!ops->port_function_hw_addr_set) {
NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR],
"Port doesn't support function attributes");
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
return -EOPNOTSUPP;
}
if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) {
NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR],
"Function does not support state setting");
return -EOPNOTSUPP;
}
attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
if (attr) {
struct nla_bitfield32 caps;
caps = nla_get_bitfield32(attr);
if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE &&
!ops->port_fn_roce_set) {
NL_SET_ERR_MSG_ATTR(extack, attr,
"Port doesn't support RoCE function attribute");
return -EOPNOTSUPP;
}
devlink: Expose port function commands to control migratable Expose port function commands to enable / disable migratable capability, this is used to set the port function as migratable. Live migration is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration. e.g.: to be migratable. In order for VF to be migratable, VF must be bound to VFIO driver with migration support. When migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo <pci_id> > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Acked-by: Shannon Nelson <shannon.nelson@amd.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-12-06 20:51:18 +02:00
if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
if (!ops->port_fn_migratable_set) {
NL_SET_ERR_MSG_ATTR(extack, attr,
"Port doesn't support migratable function attribute");
return -EOPNOTSUPP;
}
if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
NL_SET_ERR_MSG_ATTR(extack, attr,
"migratable function attribute supported for VFs only");
return -EOPNOTSUPP;
}
}
}
return 0;
}
static int devlink_port_function_set(struct devlink_port *port,
const struct nlattr *attr,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1];
int err;
err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr,
devlink_function_nl_policy, extack);
if (err < 0) {
NL_SET_ERR_MSG_MOD(extack, "Fail to parse port function attributes");
return err;
}
err = devlink_port_function_validate(port, tb, extack);
if (err)
return err;
attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR];
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
if (attr) {
err = devlink_port_function_hw_addr_set(port, attr, extack);
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
if (err)
return err;
}
attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
if (attr) {
err = devlink_port_fn_caps_set(port, attr, extack);
if (err)
return err;
}
devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:15 -08:00
/* Keep this as the last function attribute set, so that when
* multiple port function attributes are set along with state,
* Those can be applied first before activating the state.
*/
attr = tb[DEVLINK_PORT_FN_ATTR_STATE];
if (attr)
err = devlink_port_fn_state_set(port, attr, extack);
if (!err)
devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
return err;
}
static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
int err;
if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
enum devlink_port_type port_type;
port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
err = devlink_port_type_set(devlink_port, port_type);
if (err)
return err;
}
if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) {
struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION];
struct netlink_ext_ack *extack = info->extack;
err = devlink_port_function_set(devlink_port, attr, extack);
if (err)
return err;
}
return 0;
}
static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = info->user_ptr[0];
u32 count;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT))
return -EINVAL;
if (!devlink->ops->port_split)
return -EOPNOTSUPP;
count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
if (!devlink_port->attrs.splittable) {
/* Split ports cannot be split. */
if (devlink_port->attrs.split)
NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split further");
else
NL_SET_ERR_MSG_MOD(info->extack, "Port cannot be split");
return -EINVAL;
}
if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) {
NL_SET_ERR_MSG_MOD(info->extack, "Invalid split count");
return -EINVAL;
}
return devlink->ops->port_split(devlink, devlink_port, count,
info->extack);
}
static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = info->user_ptr[0];
if (!devlink->ops->port_unsplit)
return -EOPNOTSUPP;
return devlink->ops->port_unsplit(devlink, devlink_port, info->extack);
}
static int devlink_port_new_notify(struct devlink *devlink,
unsigned int port_index,
struct genl_info *info)
devlink: Support add and delete devlink port Extended devlink interface for the user to add and delete a port. Extend devlink to connect user requests to driver to add/delete a port in the device. Driver routines are invoked without holding devlink instance lock. This enables driver to perform several devlink objects registration, unregistration such as (port, health reporter, resource etc) by using existing devlink APIs. This also helps to uniformly use the code for port unregistration during driver unload and during port deletion initiated by user. Examples of add, show and delete commands: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ udevadm test-builtin net_id /sys/class/net/eth6 Load module index Parsed configuration file /usr/lib/systemd/network/99-default.link Created link configuration context. Using default interface naming scheme 'v245'. ID_NET_NAMING_SCHEME=v245 ID_NET_NAME_PATH=enp6s0f0npf0sf88 ID_NET_NAME_SLOT=ens2f0npf0sf88 Unload module index Unloaded link configuration context. Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:14 -08:00
{
struct devlink_port *devlink_port;
struct sk_buff *msg;
int err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
lockdep_assert_held(&devlink->lock);
devlink: Support add and delete devlink port Extended devlink interface for the user to add and delete a port. Extend devlink to connect user requests to driver to add/delete a port in the device. Driver routines are invoked without holding devlink instance lock. This enables driver to perform several devlink objects registration, unregistration such as (port, health reporter, resource etc) by using existing devlink APIs. This also helps to uniformly use the code for port unregistration during driver unload and during port deletion initiated by user. Examples of add, show and delete commands: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ udevadm test-builtin net_id /sys/class/net/eth6 Load module index Parsed configuration file /usr/lib/systemd/network/99-default.link Created link configuration context. Using default interface naming scheme 'v245'. ID_NET_NAMING_SCHEME=v245 ID_NET_NAME_PATH=enp6s0f0npf0sf88 ID_NET_NAME_SLOT=ens2f0npf0sf88 Unload module index Unloaded link configuration context. Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:14 -08:00
devlink_port = devlink_port_get_by_index(devlink, port_index);
if (!devlink_port) {
err = -ENODEV;
goto out;
}
err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW,
info->snd_portid, info->snd_seq, 0, NULL);
devlink: Support add and delete devlink port Extended devlink interface for the user to add and delete a port. Extend devlink to connect user requests to driver to add/delete a port in the device. Driver routines are invoked without holding devlink instance lock. This enables driver to perform several devlink objects registration, unregistration such as (port, health reporter, resource etc) by using existing devlink APIs. This also helps to uniformly use the code for port unregistration during driver unload and during port deletion initiated by user. Examples of add, show and delete commands: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ udevadm test-builtin net_id /sys/class/net/eth6 Load module index Parsed configuration file /usr/lib/systemd/network/99-default.link Created link configuration context. Using default interface naming scheme 'v245'. ID_NET_NAMING_SCHEME=v245 ID_NET_NAME_PATH=enp6s0f0npf0sf88 ID_NET_NAME_SLOT=ens2f0npf0sf88 Unload module index Unloaded link configuration context. Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:14 -08:00
if (err)
goto out;
return genlmsg_reply(msg, info);
devlink: Support add and delete devlink port Extended devlink interface for the user to add and delete a port. Extend devlink to connect user requests to driver to add/delete a port in the device. Driver routines are invoked without holding devlink instance lock. This enables driver to perform several devlink objects registration, unregistration such as (port, health reporter, resource etc) by using existing devlink APIs. This also helps to uniformly use the code for port unregistration during driver unload and during port deletion initiated by user. Examples of add, show and delete commands: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ udevadm test-builtin net_id /sys/class/net/eth6 Load module index Parsed configuration file /usr/lib/systemd/network/99-default.link Created link configuration context. Using default interface naming scheme 'v245'. ID_NET_NAMING_SCHEME=v245 ID_NET_NAME_PATH=enp6s0f0npf0sf88 ID_NET_NAME_SLOT=ens2f0npf0sf88 Unload module index Unloaded link configuration context. Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:14 -08:00
out:
nlmsg_free(msg);
return err;
}
static int devlink_nl_cmd_port_new_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct netlink_ext_ack *extack = info->extack;
struct devlink_port_new_attrs new_attrs = {};
struct devlink *devlink = info->user_ptr[0];
unsigned int new_port_index;
int err;
if (!devlink->ops->port_new || !devlink->ops->port_del)
return -EOPNOTSUPP;
if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] ||
!info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) {
NL_SET_ERR_MSG_MOD(extack, "Port flavour or PCI PF are not specified");
return -EINVAL;
}
new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]);
new_attrs.pfnum =
nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]);
if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
/* Port index of the new port being created by driver. */
new_attrs.port_index =
nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
new_attrs.port_index_valid = true;
}
if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) {
new_attrs.controller =
nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]);
new_attrs.controller_valid = true;
}
if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF &&
info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) {
new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]);
new_attrs.sfnum_valid = true;
}
err = devlink->ops->port_new(devlink, &new_attrs, extack,
&new_port_index);
if (err)
return err;
err = devlink_port_new_notify(devlink, new_port_index, info);
devlink: Support add and delete devlink port Extended devlink interface for the user to add and delete a port. Extend devlink to connect user requests to driver to add/delete a port in the device. Driver routines are invoked without holding devlink instance lock. This enables driver to perform several devlink objects registration, unregistration such as (port, health reporter, resource etc) by using existing devlink APIs. This also helps to uniformly use the code for port unregistration during driver unload and during port deletion initiated by user. Examples of add, show and delete commands: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ udevadm test-builtin net_id /sys/class/net/eth6 Load module index Parsed configuration file /usr/lib/systemd/network/99-default.link Created link configuration context. Using default interface naming scheme 'v245'. ID_NET_NAMING_SCHEME=v245 ID_NET_NAME_PATH=enp6s0f0npf0sf88 ID_NET_NAME_SLOT=ens2f0npf0sf88 Unload module index Unloaded link configuration context. Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:14 -08:00
if (err && err != -ENODEV) {
/* Fail to send the response; destroy newly created port. */
devlink->ops->port_del(devlink, new_port_index, extack);
}
return err;
}
static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
unsigned int port_index;
if (!devlink->ops->port_del)
return -EOPNOTSUPP;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_INDEX)) {
devlink: Support add and delete devlink port Extended devlink interface for the user to add and delete a port. Extend devlink to connect user requests to driver to add/delete a port in the device. Driver routines are invoked without holding devlink instance lock. This enables driver to perform several devlink objects registration, unregistration such as (port, health reporter, resource etc) by using existing devlink APIs. This also helps to uniformly use the code for port unregistration during driver unload and during port deletion initiated by user. Examples of add, show and delete commands: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ udevadm test-builtin net_id /sys/class/net/eth6 Load module index Parsed configuration file /usr/lib/systemd/network/99-default.link Created link configuration context. Using default interface naming scheme 'v245'. ID_NET_NAMING_SCHEME=v245 ID_NET_NAME_PATH=enp6s0f0npf0sf88 ID_NET_NAME_SLOT=ens2f0npf0sf88 Unload module index Unloaded link configuration context. Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:14 -08:00
NL_SET_ERR_MSG_MOD(extack, "Port index is not specified");
return -EINVAL;
}
port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
return devlink->ops->port_del(devlink, port_index, extack);
}
static int
devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
struct genl_info *info,
struct nlattr *nla_parent)
{
struct devlink *devlink = devlink_rate->devlink;
const char *parent_name = nla_data(nla_parent);
const struct devlink_ops *ops = devlink->ops;
size_t len = strlen(parent_name);
struct devlink_rate *parent;
int err = -EOPNOTSUPP;
parent = devlink_rate->parent;
if (parent && !len) {
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_parent_set(devlink_rate, NULL,
devlink_rate->priv, NULL,
info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_parent_set(devlink_rate, NULL,
devlink_rate->priv, NULL,
info->extack);
if (err)
return err;
refcount_dec(&parent->refcnt);
devlink_rate->parent = NULL;
} else if (len) {
parent = devlink_rate_node_get_by_name(devlink, parent_name);
if (IS_ERR(parent))
return -ENODEV;
if (parent == devlink_rate) {
NL_SET_ERR_MSG_MOD(info->extack, "Parent to self is not allowed");
return -EINVAL;
}
if (devlink_rate_is_node(devlink_rate) &&
devlink_rate_is_parent_node(devlink_rate, parent->parent)) {
NL_SET_ERR_MSG_MOD(info->extack, "Node is already a parent of parent node.");
return -EEXIST;
}
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_parent_set(devlink_rate, parent,
devlink_rate->priv, parent->priv,
info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_parent_set(devlink_rate, parent,
devlink_rate->priv, parent->priv,
info->extack);
if (err)
return err;
if (devlink_rate->parent)
/* we're reassigning to other parent in this case */
refcount_dec(&devlink_rate->parent->refcnt);
refcount_inc(&parent->refcnt);
devlink_rate->parent = parent;
}
return 0;
}
static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
const struct devlink_ops *ops,
struct genl_info *info)
{
struct nlattr *nla_parent, **attrs = info->attrs;
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
int err = -EOPNOTSUPP;
u32 priority;
u32 weight;
u64 rate;
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) {
rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]);
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv,
rate, info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv,
rate, info->extack);
if (err)
return err;
devlink_rate->tx_share = rate;
}
if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) {
rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]);
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv,
rate, info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv,
rate, info->extack);
if (err)
return err;
devlink_rate->tx_max = rate;
}
if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) {
priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]);
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv,
priority, info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv,
priority, info->extack);
if (err)
return err;
devlink_rate->tx_priority = priority;
}
if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) {
weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]);
if (devlink_rate_is_leaf(devlink_rate))
err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv,
weight, info->extack);
else if (devlink_rate_is_node(devlink_rate))
err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv,
weight, info->extack);
if (err)
return err;
devlink_rate->tx_weight = weight;
}
nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME];
if (nla_parent) {
err = devlink_nl_rate_parent_node_set(devlink_rate, info,
nla_parent);
if (err)
return err;
}
return 0;
}
static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
struct genl_info *info,
enum devlink_rate_type type)
{
struct nlattr **attrs = info->attrs;
if (type == DEVLINK_RATE_TYPE_LEAF) {
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the leafs");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the leafs");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
!ops->rate_leaf_parent_set) {
NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) {
NL_SET_ERR_MSG_ATTR(info->extack,
attrs[DEVLINK_ATTR_RATE_TX_PRIORITY],
"TX priority set isn't supported for the leafs");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) {
NL_SET_ERR_MSG_ATTR(info->extack,
attrs[DEVLINK_ATTR_RATE_TX_WEIGHT],
"TX weight set isn't supported for the leafs");
return false;
}
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
} else if (type == DEVLINK_RATE_TYPE_NODE) {
if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) {
NL_SET_ERR_MSG_MOD(info->extack, "TX max set isn't supported for the nodes");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
!ops->rate_node_parent_set) {
NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) {
NL_SET_ERR_MSG_ATTR(info->extack,
attrs[DEVLINK_ATTR_RATE_TX_PRIORITY],
"TX priority set isn't supported for the nodes");
return false;
}
if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) {
NL_SET_ERR_MSG_ATTR(info->extack,
attrs[DEVLINK_ATTR_RATE_TX_WEIGHT],
"TX weight set isn't supported for the nodes");
return false;
}
} else {
WARN(1, "Unknown type of rate object");
return false;
}
return true;
}
static int devlink_nl_cmd_rate_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_rate *devlink_rate = info->user_ptr[1];
struct devlink *devlink = devlink_rate->devlink;
const struct devlink_ops *ops = devlink->ops;
int err;
if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type))
return -EOPNOTSUPP;
err = devlink_nl_rate_set(devlink_rate, ops, info);
if (!err)
devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
return err;
}
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
static int devlink_nl_cmd_rate_new_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_rate *rate_node;
const struct devlink_ops *ops;
int err;
ops = devlink->ops;
if (!ops || !ops->rate_node_new || !ops->rate_node_del) {
NL_SET_ERR_MSG_MOD(info->extack, "Rate nodes aren't supported");
return -EOPNOTSUPP;
}
if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE))
return -EOPNOTSUPP;
rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs);
if (!IS_ERR(rate_node))
return -EEXIST;
else if (rate_node == ERR_PTR(-EINVAL))
return -EINVAL;
rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
if (!rate_node)
return -ENOMEM;
rate_node->devlink = devlink;
rate_node->type = DEVLINK_RATE_TYPE_NODE;
rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL);
if (!rate_node->name) {
err = -ENOMEM;
goto err_strdup;
}
err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack);
if (err)
goto err_node_new;
err = devlink_nl_rate_set(rate_node, ops, info);
if (err)
goto err_rate_set;
refcount_set(&rate_node->refcnt, 1);
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
list_add(&rate_node->list, &devlink->rate_list);
devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
return 0;
err_rate_set:
ops->rate_node_del(rate_node, rate_node->priv, info->extack);
err_node_new:
kfree(rate_node->name);
err_strdup:
kfree(rate_node);
return err;
}
static int devlink_nl_cmd_rate_del_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_rate *rate_node = info->user_ptr[1];
struct devlink *devlink = rate_node->devlink;
const struct devlink_ops *ops = devlink->ops;
int err;
if (refcount_read(&rate_node->refcnt) > 1) {
NL_SET_ERR_MSG_MOD(info->extack, "Node has children. Cannot delete node.");
return -EBUSY;
}
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
err = ops->rate_node_del(rate_node, rate_node->priv, info->extack);
if (rate_node->parent)
refcount_dec(&rate_node->parent->refcnt);
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
list_del(&rate_node->list);
kfree(rate_node->name);
kfree(rate_node);
return err;
}
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
struct devlink_linecard_type {
const char *type;
const void *priv;
};
static int devlink_nl_linecard_fill(struct sk_buff *msg,
struct devlink *devlink,
struct devlink_linecard *linecard,
enum devlink_command cmd, u32 portid,
u32 seq, int flags,
struct netlink_ext_ack *extack)
{
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
struct devlink_linecard_type *linecard_type;
struct nlattr *attr;
void *hdr;
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
int i;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index))
goto nla_put_failure;
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state))
goto nla_put_failure;
if (linecard->type &&
nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type))
goto nla_put_failure;
if (linecard->types_count) {
attr = nla_nest_start(msg,
DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES);
if (!attr)
goto nla_put_failure;
for (i = 0; i < linecard->types_count; i++) {
linecard_type = &linecard->types[i];
if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE,
linecard_type->type)) {
nla_nest_cancel(msg, attr);
goto nla_put_failure;
}
}
nla_nest_end(msg, attr);
}
if (linecard->nested_devlink &&
devlink_nl_put_nested_handle(msg, linecard->nested_devlink))
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static void devlink_linecard_notify(struct devlink_linecard *linecard,
enum devlink_command cmd)
{
struct devlink *devlink = linecard->devlink;
struct sk_buff *msg;
int err;
WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW &&
cmd != DEVLINK_CMD_LINECARD_DEL);
if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
return;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0,
NULL);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static int devlink_nl_cmd_linecard_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_linecard *linecard = info->user_ptr[1];
struct devlink *devlink = linecard->devlink;
struct sk_buff *msg;
int err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
mutex_lock(&linecard->state_lock);
err = devlink_nl_linecard_fill(msg, devlink, linecard,
DEVLINK_CMD_LINECARD_NEW,
info->snd_portid, info->snd_seq, 0,
info->extack);
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
mutex_unlock(&linecard->state_lock);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int devlink_nl_cmd_linecard_get_dump_one(struct sk_buff *msg,
struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_linecard *linecard;
int idx = 0;
int err = 0;
list_for_each_entry(linecard, &devlink->linecard_list, list) {
if (idx < state->idx) {
idx++;
continue;
}
mutex_lock(&linecard->state_lock);
err = devlink_nl_linecard_fill(msg, devlink, linecard,
DEVLINK_CMD_LINECARD_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI,
cb->extack);
mutex_unlock(&linecard->state_lock);
if (err) {
state->idx = idx;
break;
}
idx++;
}
return err;
}
const struct devlink_cmd devl_cmd_linecard_get = {
.dump_one = devlink_nl_cmd_linecard_get_dump_one,
};
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
static struct devlink_linecard_type *
devlink_linecard_type_lookup(struct devlink_linecard *linecard,
const char *type)
{
struct devlink_linecard_type *linecard_type;
int i;
for (i = 0; i < linecard->types_count; i++) {
linecard_type = &linecard->types[i];
if (!strcmp(type, linecard_type->type))
return linecard_type;
}
return NULL;
}
static int devlink_linecard_type_set(struct devlink_linecard *linecard,
const char *type,
struct netlink_ext_ack *extack)
{
const struct devlink_linecard_ops *ops = linecard->ops;
struct devlink_linecard_type *linecard_type;
int err;
mutex_lock(&linecard->state_lock);
if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned");
err = -EBUSY;
goto out;
}
if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned");
err = -EBUSY;
goto out;
}
linecard_type = devlink_linecard_type_lookup(linecard, type);
if (!linecard_type) {
NL_SET_ERR_MSG_MOD(extack, "Unsupported line card type provided");
err = -EINVAL;
goto out;
}
if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED &&
linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
NL_SET_ERR_MSG_MOD(extack, "Line card already provisioned");
err = -EBUSY;
/* Check if the line card is provisioned in the same
* way the user asks. In case it is, make the operation
* to return success.
*/
if (ops->same_provision &&
ops->same_provision(linecard, linecard->priv,
linecard_type->type,
linecard_type->priv))
err = 0;
goto out;
}
linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING;
linecard->type = linecard_type->type;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
err = ops->provision(linecard, linecard->priv, linecard_type->type,
linecard_type->priv, extack);
if (err) {
/* Provisioning failed. Assume the linecard is unprovisioned
* for future operations.
*/
mutex_lock(&linecard->state_lock);
linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
linecard->type = NULL;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
}
return err;
out:
mutex_unlock(&linecard->state_lock);
return err;
}
static int devlink_linecard_type_unset(struct devlink_linecard *linecard,
struct netlink_ext_ack *extack)
{
int err;
mutex_lock(&linecard->state_lock);
if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
NL_SET_ERR_MSG_MOD(extack, "Line card is currently being provisioned");
err = -EBUSY;
goto out;
}
if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
NL_SET_ERR_MSG_MOD(extack, "Line card is currently being unprovisioned");
err = -EBUSY;
goto out;
}
if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
linecard->type = NULL;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
err = 0;
goto out;
}
if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) {
NL_SET_ERR_MSG_MOD(extack, "Line card is not provisioned");
err = 0;
goto out;
}
linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
err = linecard->ops->unprovision(linecard, linecard->priv,
extack);
if (err) {
/* Unprovisioning failed. Assume the linecard is unprovisioned
* for future operations.
*/
mutex_lock(&linecard->state_lock);
linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
linecard->type = NULL;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
}
return err;
out:
mutex_unlock(&linecard->state_lock);
return err;
}
static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_linecard *linecard = info->user_ptr[1];
struct netlink_ext_ack *extack = info->extack;
int err;
if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) {
const char *type;
type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]);
if (strcmp(type, "")) {
err = devlink_linecard_type_set(linecard, type, extack);
if (err)
return err;
} else {
err = devlink_linecard_type_unset(linecard, extack);
if (err)
return err;
}
}
return 0;
}
static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_sb *devlink_sb,
enum devlink_command cmd, u32 portid,
u32 seq, int flags)
{
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT,
devlink_sb->ingress_pools_count))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT,
devlink_sb->egress_pools_count))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT,
devlink_sb->ingress_tc_count))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT,
devlink_sb->egress_tc_count))
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_sb *devlink_sb;
struct sk_buff *msg;
int err;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
DEVLINK_CMD_SB_NEW,
info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int
devlink_nl_cmd_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_sb *devlink_sb;
int idx = 0;
int err = 0;
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
if (idx < state->idx) {
idx++;
continue;
}
err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
DEVLINK_CMD_SB_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
state->idx = idx;
break;
}
idx++;
}
return err;
}
const struct devlink_cmd devl_cmd_sb_get = {
.dump_one = devlink_nl_cmd_sb_get_dump_one,
};
static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_sb *devlink_sb,
u16 pool_index, enum devlink_command cmd,
u32 portid, u32 seq, int flags)
{
struct devlink_sb_pool_info pool_info;
void *hdr;
int err;
err = devlink->ops->sb_pool_get(devlink, devlink_sb->index,
pool_index, &pool_info);
if (err)
return err;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
goto nla_put_failure;
if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size))
goto nla_put_failure;
if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
pool_info.threshold_type))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE,
pool_info.cell_size))
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_sb *devlink_sb;
struct sk_buff *msg;
u16 pool_index;
int err;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
return err;
if (!devlink->ops->sb_pool_get)
return -EOPNOTSUPP;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index,
DEVLINK_CMD_SB_POOL_NEW,
info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
struct devlink *devlink,
struct devlink_sb *devlink_sb,
u32 portid, u32 seq)
{
u16 pool_count = devlink_sb_pool_count(devlink_sb);
u16 pool_index;
int err;
for (pool_index = 0; pool_index < pool_count; pool_index++) {
if (*p_idx < start) {
(*p_idx)++;
continue;
}
err = devlink_nl_sb_pool_fill(msg, devlink,
devlink_sb,
pool_index,
DEVLINK_CMD_SB_POOL_NEW,
portid, seq, NLM_F_MULTI);
if (err)
return err;
(*p_idx)++;
}
return 0;
}
static int
devlink_nl_cmd_sb_pool_get_dump_one(struct sk_buff *msg,
struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_sb *devlink_sb;
int err = 0;
int idx = 0;
if (!devlink->ops->sb_pool_get)
return 0;
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
err = __sb_pool_get_dumpit(msg, state->idx, &idx,
devlink, devlink_sb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq);
if (err == -EOPNOTSUPP) {
err = 0;
} else if (err) {
state->idx = idx;
break;
}
}
return err;
}
const struct devlink_cmd devl_cmd_sb_pool_get = {
.dump_one = devlink_nl_cmd_sb_pool_get_dump_one,
};
static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
u16 pool_index, u32 size,
enum devlink_sb_threshold_type threshold_type,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink->ops;
if (ops->sb_pool_set)
return ops->sb_pool_set(devlink, sb_index, pool_index,
size, threshold_type, extack);
return -EOPNOTSUPP;
}
static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
enum devlink_sb_threshold_type threshold_type;
struct devlink_sb *devlink_sb;
u16 pool_index;
u32 size;
int err;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
return err;
err = devlink_sb_th_type_get_from_info(info, &threshold_type);
if (err)
return err;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE))
return -EINVAL;
size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
return devlink_sb_pool_set(devlink, devlink_sb->index,
pool_index, size, threshold_type,
info->extack);
}
static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
struct devlink *devlink,
struct devlink_port *devlink_port,
struct devlink_sb *devlink_sb,
u16 pool_index,
enum devlink_command cmd,
u32 portid, u32 seq, int flags)
{
const struct devlink_ops *ops = devlink->ops;
u32 threshold;
void *hdr;
int err;
err = ops->sb_port_pool_get(devlink_port, devlink_sb->index,
pool_index, &threshold);
if (err)
return err;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
goto nla_put_failure;
if (ops->sb_occ_port_pool_get) {
u32 cur;
u32 max;
err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
pool_index, &cur, &max);
if (err && err != -EOPNOTSUPP)
goto sb_occ_get_failure;
if (!err) {
if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
goto nla_put_failure;
}
}
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
err = -EMSGSIZE;
sb_occ_get_failure:
genlmsg_cancel(msg, hdr);
return err;
}
static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = devlink_port->devlink;
struct devlink_sb *devlink_sb;
struct sk_buff *msg;
u16 pool_index;
int err;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
return err;
if (!devlink->ops->sb_port_pool_get)
return -EOPNOTSUPP;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port,
devlink_sb, pool_index,
DEVLINK_CMD_SB_PORT_POOL_NEW,
info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
struct devlink *devlink,
struct devlink_sb *devlink_sb,
u32 portid, u32 seq)
{
struct devlink_port *devlink_port;
u16 pool_count = devlink_sb_pool_count(devlink_sb);
unsigned long port_index;
u16 pool_index;
int err;
xa_for_each(&devlink->ports, port_index, devlink_port) {
for (pool_index = 0; pool_index < pool_count; pool_index++) {
if (*p_idx < start) {
(*p_idx)++;
continue;
}
err = devlink_nl_sb_port_pool_fill(msg, devlink,
devlink_port,
devlink_sb,
pool_index,
DEVLINK_CMD_SB_PORT_POOL_NEW,
portid, seq,
NLM_F_MULTI);
if (err)
return err;
(*p_idx)++;
}
}
return 0;
}
static int
devlink_nl_cmd_sb_port_pool_get_dump_one(struct sk_buff *msg,
struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_sb *devlink_sb;
int idx = 0;
int err = 0;
if (!devlink->ops->sb_port_pool_get)
return 0;
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
err = __sb_port_pool_get_dumpit(msg, state->idx, &idx,
devlink, devlink_sb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq);
if (err == -EOPNOTSUPP) {
err = 0;
} else if (err) {
state->idx = idx;
break;
}
}
return err;
}
const struct devlink_cmd devl_cmd_sb_port_pool_get = {
.dump_one = devlink_nl_cmd_sb_port_pool_get_dump_one,
};
static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
unsigned int sb_index, u16 pool_index,
u32 threshold,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink_port->devlink->ops;
if (ops->sb_port_pool_set)
return ops->sb_port_pool_set(devlink_port, sb_index,
pool_index, threshold, extack);
return -EOPNOTSUPP;
}
static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = info->user_ptr[0];
struct devlink_sb *devlink_sb;
u16 pool_index;
u32 threshold;
int err;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
return err;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
return -EINVAL;
threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
return devlink_sb_port_pool_set(devlink_port, devlink_sb->index,
pool_index, threshold, info->extack);
}
static int
devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink,
struct devlink_port *devlink_port,
struct devlink_sb *devlink_sb, u16 tc_index,
enum devlink_sb_pool_type pool_type,
enum devlink_command cmd,
u32 portid, u32 seq, int flags)
{
const struct devlink_ops *ops = devlink->ops;
u16 pool_index;
u32 threshold;
void *hdr;
int err;
err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index,
tc_index, pool_type,
&pool_index, &threshold);
if (err)
return err;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index))
goto nla_put_failure;
if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type))
goto nla_put_failure;
if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
goto nla_put_failure;
if (ops->sb_occ_tc_port_bind_get) {
u32 cur;
u32 max;
err = ops->sb_occ_tc_port_bind_get(devlink_port,
devlink_sb->index,
tc_index, pool_type,
&cur, &max);
if (err && err != -EOPNOTSUPP)
return err;
if (!err) {
if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
goto nla_put_failure;
}
}
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = devlink_port->devlink;
struct devlink_sb *devlink_sb;
struct sk_buff *msg;
enum devlink_sb_pool_type pool_type;
u16 tc_index;
int err;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
err = devlink_sb_pool_type_get_from_info(info, &pool_type);
if (err)
return err;
err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
pool_type, &tc_index);
if (err)
return err;
if (!devlink->ops->sb_tc_pool_bind_get)
return -EOPNOTSUPP;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port,
devlink_sb, tc_index, pool_type,
DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
info->snd_portid,
info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
int start, int *p_idx,
struct devlink *devlink,
struct devlink_sb *devlink_sb,
u32 portid, u32 seq)
{
struct devlink_port *devlink_port;
unsigned long port_index;
u16 tc_index;
int err;
xa_for_each(&devlink->ports, port_index, devlink_port) {
for (tc_index = 0;
tc_index < devlink_sb->ingress_tc_count; tc_index++) {
if (*p_idx < start) {
(*p_idx)++;
continue;
}
err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
devlink_port,
devlink_sb,
tc_index,
DEVLINK_SB_POOL_TYPE_INGRESS,
DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
portid, seq,
NLM_F_MULTI);
if (err)
return err;
(*p_idx)++;
}
for (tc_index = 0;
tc_index < devlink_sb->egress_tc_count; tc_index++) {
if (*p_idx < start) {
(*p_idx)++;
continue;
}
err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
devlink_port,
devlink_sb,
tc_index,
DEVLINK_SB_POOL_TYPE_EGRESS,
DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
portid, seq,
NLM_F_MULTI);
if (err)
return err;
(*p_idx)++;
}
}
return 0;
}
static int
devlink_nl_cmd_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg,
struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_sb *devlink_sb;
int idx = 0;
int err = 0;
if (!devlink->ops->sb_tc_pool_bind_get)
return 0;
list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
err = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &idx,
devlink, devlink_sb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq);
if (err == -EOPNOTSUPP) {
err = 0;
} else if (err) {
state->idx = idx;
break;
}
}
return err;
}
const struct devlink_cmd devl_cmd_sb_tc_pool_bind_get = {
.dump_one = devlink_nl_cmd_sb_tc_pool_bind_get_dump_one,
};
static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
unsigned int sb_index, u16 tc_index,
enum devlink_sb_pool_type pool_type,
u16 pool_index, u32 threshold,
struct netlink_ext_ack *extack)
{
const struct devlink_ops *ops = devlink_port->devlink->ops;
if (ops->sb_tc_pool_bind_set)
return ops->sb_tc_pool_bind_set(devlink_port, sb_index,
tc_index, pool_type,
pool_index, threshold, extack);
return -EOPNOTSUPP;
}
static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_port *devlink_port = info->user_ptr[1];
struct devlink *devlink = info->user_ptr[0];
enum devlink_sb_pool_type pool_type;
struct devlink_sb *devlink_sb;
u16 tc_index;
u16 pool_index;
u32 threshold;
int err;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
err = devlink_sb_pool_type_get_from_info(info, &pool_type);
if (err)
return err;
err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
pool_type, &tc_index);
if (err)
return err;
err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
&pool_index);
if (err)
return err;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
return -EINVAL;
threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index,
tc_index, pool_type,
pool_index, threshold, info->extack);
}
static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
struct devlink_sb *devlink_sb;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
if (ops->sb_occ_snapshot)
return ops->sb_occ_snapshot(devlink, devlink_sb->index);
return -EOPNOTSUPP;
}
static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
struct devlink_sb *devlink_sb;
devlink_sb = devlink_sb_get_from_info(devlink, info);
if (IS_ERR(devlink_sb))
return PTR_ERR(devlink_sb);
if (ops->sb_occ_max_clear)
return ops->sb_occ_max_clear(devlink, devlink_sb->index);
return -EOPNOTSUPP;
}
int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
struct netlink_ext_ack *extack)
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
{
struct devlink_rate *devlink_rate;
list_for_each_entry(devlink_rate, &devlink->rate_list, list)
if (devlink_rate_is_node(devlink_rate)) {
NL_SET_ERR_MSG_MOD(extack, "Rate node(s) exists.");
return -EBUSY;
}
return 0;
}
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
int devlink_dpipe_match_put(struct sk_buff *skb,
struct devlink_dpipe_match *match)
{
struct devlink_dpipe_header *header = match->header;
struct devlink_dpipe_field *field = &header->fields[match->field_id];
struct nlattr *match_attr;
match_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_MATCH);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!match_attr)
return -EMSGSIZE;
if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
goto nla_put_failure;
nla_nest_end(skb, match_attr);
return 0;
nla_put_failure:
nla_nest_cancel(skb, match_attr);
return -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(devlink_dpipe_match_put);
static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table,
struct sk_buff *skb)
{
struct nlattr *matches_attr;
matches_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_DPIPE_TABLE_MATCHES);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!matches_attr)
return -EMSGSIZE;
if (table->table_ops->matches_dump(table->priv, skb))
goto nla_put_failure;
nla_nest_end(skb, matches_attr);
return 0;
nla_put_failure:
nla_nest_cancel(skb, matches_attr);
return -EMSGSIZE;
}
int devlink_dpipe_action_put(struct sk_buff *skb,
struct devlink_dpipe_action *action)
{
struct devlink_dpipe_header *header = action->header;
struct devlink_dpipe_field *field = &header->fields[action->field_id];
struct nlattr *action_attr;
action_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ACTION);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!action_attr)
return -EMSGSIZE;
if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
goto nla_put_failure;
nla_nest_end(skb, action_attr);
return 0;
nla_put_failure:
nla_nest_cancel(skb, action_attr);
return -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(devlink_dpipe_action_put);
static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table,
struct sk_buff *skb)
{
struct nlattr *actions_attr;
actions_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_DPIPE_TABLE_ACTIONS);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!actions_attr)
return -EMSGSIZE;
if (table->table_ops->actions_dump(table->priv, skb))
goto nla_put_failure;
nla_nest_end(skb, actions_attr);
return 0;
nla_put_failure:
nla_nest_cancel(skb, actions_attr);
return -EMSGSIZE;
}
static int devlink_dpipe_table_put(struct sk_buff *skb,
struct devlink_dpipe_table *table)
{
struct nlattr *table_attr;
u64 table_size;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
table_size = table->table_ops->size_get(table->priv);
table_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!table_attr)
return -EMSGSIZE;
if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) ||
nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size,
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED,
table->counters_enabled))
goto nla_put_failure;
if (table->resource_valid) {
if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
table->resource_id, DEVLINK_ATTR_PAD) ||
nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
table->resource_units, DEVLINK_ATTR_PAD))
goto nla_put_failure;
}
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (devlink_dpipe_matches_put(table, skb))
goto nla_put_failure;
if (devlink_dpipe_actions_put(table, skb))
goto nla_put_failure;
nla_nest_end(skb, table_attr);
return 0;
nla_put_failure:
nla_nest_cancel(skb, table_attr);
return -EMSGSIZE;
}
static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb,
struct genl_info *info)
{
int err;
if (*pskb) {
err = genlmsg_reply(*pskb, info);
if (err)
return err;
}
*pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!*pskb)
return -ENOMEM;
return 0;
}
static int devlink_dpipe_tables_fill(struct genl_info *info,
enum devlink_command cmd, int flags,
struct list_head *dpipe_tables,
const char *table_name)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_dpipe_table *table;
struct nlattr *tables_attr;
struct sk_buff *skb = NULL;
struct nlmsghdr *nlh;
bool incomplete;
void *hdr;
int i;
int err;
table = list_first_entry(dpipe_tables,
struct devlink_dpipe_table, list);
start_again:
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
return err;
hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
&devlink_nl_family, NLM_F_MULTI, cmd);
if (!hdr) {
nlmsg_free(skb);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
return -EMSGSIZE;
}
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (devlink_nl_put_handle(skb, devlink))
goto nla_put_failure;
tables_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLES);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!tables_attr)
goto nla_put_failure;
i = 0;
incomplete = false;
list_for_each_entry_from(table, dpipe_tables, list) {
if (!table_name) {
err = devlink_dpipe_table_put(skb, table);
if (err) {
if (!i)
goto err_table_put;
incomplete = true;
break;
}
} else {
if (!strcmp(table->name, table_name)) {
err = devlink_dpipe_table_put(skb, table);
if (err)
break;
}
}
i++;
}
nla_nest_end(skb, tables_attr);
genlmsg_end(skb, hdr);
if (incomplete)
goto start_again;
send_done:
nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
NLMSG_DONE, 0, flags | NLM_F_MULTI);
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
return err;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
goto send_done;
}
return genlmsg_reply(skb, info);
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
nlmsg_free(skb);
return err;
}
static int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
const char *table_name = NULL;
if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME])
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0,
&devlink->dpipe_table_list,
table_name);
}
static int devlink_dpipe_value_put(struct sk_buff *skb,
struct devlink_dpipe_value *value)
{
if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE,
value->value_size, value->value))
return -EMSGSIZE;
if (value->mask)
if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK,
value->value_size, value->mask))
return -EMSGSIZE;
if (value->mapping_valid)
if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING,
value->mapping_value))
return -EMSGSIZE;
return 0;
}
static int devlink_dpipe_action_value_put(struct sk_buff *skb,
struct devlink_dpipe_value *value)
{
if (!value->action)
return -EINVAL;
if (devlink_dpipe_action_put(skb, value->action))
return -EMSGSIZE;
if (devlink_dpipe_value_put(skb, value))
return -EMSGSIZE;
return 0;
}
static int devlink_dpipe_action_values_put(struct sk_buff *skb,
struct devlink_dpipe_value *values,
unsigned int values_count)
{
struct nlattr *action_attr;
int i;
int err;
for (i = 0; i < values_count; i++) {
action_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_DPIPE_ACTION_VALUE);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!action_attr)
return -EMSGSIZE;
err = devlink_dpipe_action_value_put(skb, &values[i]);
if (err)
goto err_action_value_put;
nla_nest_end(skb, action_attr);
}
return 0;
err_action_value_put:
nla_nest_cancel(skb, action_attr);
return err;
}
static int devlink_dpipe_match_value_put(struct sk_buff *skb,
struct devlink_dpipe_value *value)
{
if (!value->match)
return -EINVAL;
if (devlink_dpipe_match_put(skb, value->match))
return -EMSGSIZE;
if (devlink_dpipe_value_put(skb, value))
return -EMSGSIZE;
return 0;
}
static int devlink_dpipe_match_values_put(struct sk_buff *skb,
struct devlink_dpipe_value *values,
unsigned int values_count)
{
struct nlattr *match_attr;
int i;
int err;
for (i = 0; i < values_count; i++) {
match_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_DPIPE_MATCH_VALUE);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!match_attr)
return -EMSGSIZE;
err = devlink_dpipe_match_value_put(skb, &values[i]);
if (err)
goto err_match_value_put;
nla_nest_end(skb, match_attr);
}
return 0;
err_match_value_put:
nla_nest_cancel(skb, match_attr);
return err;
}
static int devlink_dpipe_entry_put(struct sk_buff *skb,
struct devlink_dpipe_entry *entry)
{
struct nlattr *entry_attr, *matches_attr, *actions_attr;
int err;
entry_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ENTRY);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!entry_attr)
return -EMSGSIZE;
if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index,
DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (entry->counter_valid)
if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER,
entry->counter, DEVLINK_ATTR_PAD))
goto nla_put_failure;
matches_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!matches_attr)
goto nla_put_failure;
err = devlink_dpipe_match_values_put(skb, entry->match_values,
entry->match_values_count);
if (err) {
nla_nest_cancel(skb, matches_attr);
goto err_match_values_put;
}
nla_nest_end(skb, matches_attr);
actions_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!actions_attr)
goto nla_put_failure;
err = devlink_dpipe_action_values_put(skb, entry->action_values,
entry->action_values_count);
if (err) {
nla_nest_cancel(skb, actions_attr);
goto err_action_values_put;
}
nla_nest_end(skb, actions_attr);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
nla_nest_end(skb, entry_attr);
return 0;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
nla_put_failure:
err = -EMSGSIZE;
err_match_values_put:
err_action_values_put:
nla_nest_cancel(skb, entry_attr);
return err;
}
static struct devlink_dpipe_table *
devlink_dpipe_table_find(struct list_head *dpipe_tables,
const char *table_name, struct devlink *devlink)
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
{
struct devlink_dpipe_table *table;
list_for_each_entry_rcu(table, dpipe_tables, list,
lockdep_is_held(&devlink->lock)) {
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!strcmp(table->name, table_name))
return table;
}
return NULL;
}
int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
{
struct devlink *devlink;
int err;
err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb,
dump_ctx->info);
if (err)
return err;
dump_ctx->hdr = genlmsg_put(dump_ctx->skb,
dump_ctx->info->snd_portid,
dump_ctx->info->snd_seq,
&devlink_nl_family, NLM_F_MULTI,
dump_ctx->cmd);
if (!dump_ctx->hdr)
goto nla_put_failure;
devlink = dump_ctx->info->user_ptr[0];
if (devlink_nl_put_handle(dump_ctx->skb, devlink))
goto nla_put_failure;
dump_ctx->nest = nla_nest_start_noflag(dump_ctx->skb,
DEVLINK_ATTR_DPIPE_ENTRIES);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!dump_ctx->nest)
goto nla_put_failure;
return 0;
nla_put_failure:
nlmsg_free(dump_ctx->skb);
return -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare);
int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx,
struct devlink_dpipe_entry *entry)
{
return devlink_dpipe_entry_put(dump_ctx->skb, entry);
}
EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append);
int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx)
{
nla_nest_end(dump_ctx->skb, dump_ctx->nest);
genlmsg_end(dump_ctx->skb, dump_ctx->hdr);
return 0;
}
EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close);
void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry)
{
unsigned int value_count, value_index;
struct devlink_dpipe_value *value;
value = entry->action_values;
value_count = entry->action_values_count;
for (value_index = 0; value_index < value_count; value_index++) {
kfree(value[value_index].value);
kfree(value[value_index].mask);
}
value = entry->match_values;
value_count = entry->match_values_count;
for (value_index = 0; value_index < value_count; value_index++) {
kfree(value[value_index].value);
kfree(value[value_index].mask);
}
}
EXPORT_SYMBOL_GPL(devlink_dpipe_entry_clear);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
static int devlink_dpipe_entries_fill(struct genl_info *info,
enum devlink_command cmd, int flags,
struct devlink_dpipe_table *table)
{
struct devlink_dpipe_dump_ctx dump_ctx;
struct nlmsghdr *nlh;
int err;
dump_ctx.skb = NULL;
dump_ctx.cmd = cmd;
dump_ctx.info = info;
err = table->table_ops->entries_dump(table->priv,
table->counters_enabled,
&dump_ctx);
if (err)
return err;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
send_done:
nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq,
NLMSG_DONE, 0, flags | NLM_F_MULTI);
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info);
if (err)
return err;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
goto send_done;
}
return genlmsg_reply(dump_ctx.skb, info);
}
static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_dpipe_table *table;
const char *table_name;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME))
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
return -EINVAL;
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!table)
return -EINVAL;
if (!table->table_ops->entries_dump)
return -EINVAL;
return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET,
0, table);
}
static int devlink_dpipe_fields_put(struct sk_buff *skb,
const struct devlink_dpipe_header *header)
{
struct devlink_dpipe_field *field;
struct nlattr *field_attr;
int i;
for (i = 0; i < header->fields_count; i++) {
field = &header->fields[i];
field_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_DPIPE_FIELD);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!field_attr)
return -EMSGSIZE;
if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type))
goto nla_put_failure;
nla_nest_end(skb, field_attr);
}
return 0;
nla_put_failure:
nla_nest_cancel(skb, field_attr);
return -EMSGSIZE;
}
static int devlink_dpipe_header_put(struct sk_buff *skb,
struct devlink_dpipe_header *header)
{
struct nlattr *fields_attr, *header_attr;
int err;
header_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADER);
if (!header_attr)
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
return -EMSGSIZE;
if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) ||
nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
goto nla_put_failure;
fields_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_DPIPE_HEADER_FIELDS);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!fields_attr)
goto nla_put_failure;
err = devlink_dpipe_fields_put(skb, header);
if (err) {
nla_nest_cancel(skb, fields_attr);
goto nla_put_failure;
}
nla_nest_end(skb, fields_attr);
nla_nest_end(skb, header_attr);
return 0;
nla_put_failure:
err = -EMSGSIZE;
nla_nest_cancel(skb, header_attr);
return err;
}
static int devlink_dpipe_headers_fill(struct genl_info *info,
enum devlink_command cmd, int flags,
struct devlink_dpipe_headers *
dpipe_headers)
{
struct devlink *devlink = info->user_ptr[0];
struct nlattr *headers_attr;
struct sk_buff *skb = NULL;
struct nlmsghdr *nlh;
void *hdr;
int i, j;
int err;
i = 0;
start_again:
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
return err;
hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
&devlink_nl_family, NLM_F_MULTI, cmd);
if (!hdr) {
nlmsg_free(skb);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
return -EMSGSIZE;
}
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (devlink_nl_put_handle(skb, devlink))
goto nla_put_failure;
headers_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADERS);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!headers_attr)
goto nla_put_failure;
j = 0;
for (; i < dpipe_headers->headers_count; i++) {
err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]);
if (err) {
if (!j)
goto err_table_put;
break;
}
j++;
}
nla_nest_end(skb, headers_attr);
genlmsg_end(skb, hdr);
if (i != dpipe_headers->headers_count)
goto start_again;
send_done:
nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
NLMSG_DONE, 0, flags | NLM_F_MULTI);
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
return err;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
goto send_done;
}
return genlmsg_reply(skb, info);
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
nlmsg_free(skb);
return err;
}
static int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
if (!devlink->dpipe_headers)
return -EOPNOTSUPP;
return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET,
0, devlink->dpipe_headers);
}
static int devlink_dpipe_table_counters_set(struct devlink *devlink,
const char *table_name,
bool enable)
{
struct devlink_dpipe_table *table;
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!table)
return -EINVAL;
if (table->counter_control_extern)
return -EOPNOTSUPP;
if (!(table->counters_enabled ^ enable))
return 0;
table->counters_enabled = enable;
if (table->table_ops->counters_set_update)
table->table_ops->counters_set_update(table->priv, enable);
return 0;
}
static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
const char *table_name;
bool counters_enable;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) ||
GENL_REQ_ATTR_CHECK(info,
DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED))
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
return -EINVAL;
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]);
return devlink_dpipe_table_counters_set(devlink, table_name,
counters_enable);
}
static struct devlink_resource *
devlink_resource_find(struct devlink *devlink,
struct devlink_resource *resource, u64 resource_id)
{
struct list_head *resource_list;
if (resource)
resource_list = &resource->resource_list;
else
resource_list = &devlink->resource_list;
list_for_each_entry(resource, resource_list, list) {
struct devlink_resource *child_resource;
if (resource->id == resource_id)
return resource;
child_resource = devlink_resource_find(devlink, resource,
resource_id);
if (child_resource)
return child_resource;
}
return NULL;
}
static void
devlink_resource_validate_children(struct devlink_resource *resource)
{
struct devlink_resource *child_resource;
bool size_valid = true;
u64 parts_size = 0;
if (list_empty(&resource->resource_list))
goto out;
list_for_each_entry(child_resource, &resource->resource_list, list)
parts_size += child_resource->size_new;
if (parts_size > resource->size_new)
size_valid = false;
out:
resource->size_valid = size_valid;
}
static int
devlink_resource_validate_size(struct devlink_resource *resource, u64 size,
struct netlink_ext_ack *extack)
{
u64 reminder;
int err = 0;
if (size > resource->size_params.size_max) {
NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum");
err = -EINVAL;
}
if (size < resource->size_params.size_min) {
NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum");
err = -EINVAL;
}
div64_u64_rem(size, resource->size_params.size_granularity, &reminder);
if (reminder) {
NL_SET_ERR_MSG_MOD(extack, "Wrong granularity");
err = -EINVAL;
}
return err;
}
static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_resource *resource;
u64 resource_id;
u64 size;
int err;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) ||
GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE))
return -EINVAL;
resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (!resource)
return -EINVAL;
size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
err = devlink_resource_validate_size(resource, size, info->extack);
if (err)
return err;
resource->size_new = size;
devlink_resource_validate_children(resource);
if (resource->parent)
devlink_resource_validate_children(resource->parent);
return 0;
}
static int
devlink_resource_size_params_put(struct devlink_resource *resource,
struct sk_buff *skb)
{
struct devlink_resource_size_params *size_params;
size_params = &resource->size_params;
if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
size_params->size_granularity, DEVLINK_ATTR_PAD) ||
nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
size_params->size_max, DEVLINK_ATTR_PAD) ||
nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
size_params->size_min, DEVLINK_ATTR_PAD) ||
nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit))
return -EMSGSIZE;
return 0;
}
static int devlink_resource_occ_put(struct devlink_resource *resource,
struct sk_buff *skb)
{
if (!resource->occ_get)
return 0;
return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
resource->occ_get(resource->occ_get_priv),
DEVLINK_ATTR_PAD);
}
static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
struct devlink_resource *resource)
{
struct devlink_resource *child_resource;
struct nlattr *child_resource_attr;
struct nlattr *resource_attr;
resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE);
if (!resource_attr)
return -EMSGSIZE;
if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size,
DEVLINK_ATTR_PAD) ||
nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (resource->size != resource->size_new &&
nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
resource->size_new, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (devlink_resource_occ_put(resource, skb))
goto nla_put_failure;
if (devlink_resource_size_params_put(resource, skb))
goto nla_put_failure;
if (list_empty(&resource->resource_list))
goto out;
if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
resource->size_valid))
goto nla_put_failure;
child_resource_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_RESOURCE_LIST);
if (!child_resource_attr)
goto nla_put_failure;
list_for_each_entry(child_resource, &resource->resource_list, list) {
if (devlink_resource_put(devlink, skb, child_resource))
goto resource_put_failure;
}
nla_nest_end(skb, child_resource_attr);
out:
nla_nest_end(skb, resource_attr);
return 0;
resource_put_failure:
nla_nest_cancel(skb, child_resource_attr);
nla_put_failure:
nla_nest_cancel(skb, resource_attr);
return -EMSGSIZE;
}
static int devlink_resource_fill(struct genl_info *info,
enum devlink_command cmd, int flags)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_resource *resource;
struct nlattr *resources_attr;
struct sk_buff *skb = NULL;
struct nlmsghdr *nlh;
bool incomplete;
void *hdr;
int i;
int err;
resource = list_first_entry(&devlink->resource_list,
struct devlink_resource, list);
start_again:
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
return err;
hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
&devlink_nl_family, NLM_F_MULTI, cmd);
if (!hdr) {
nlmsg_free(skb);
return -EMSGSIZE;
}
if (devlink_nl_put_handle(skb, devlink))
goto nla_put_failure;
resources_attr = nla_nest_start_noflag(skb,
DEVLINK_ATTR_RESOURCE_LIST);
if (!resources_attr)
goto nla_put_failure;
incomplete = false;
i = 0;
list_for_each_entry_from(resource, &devlink->resource_list, list) {
err = devlink_resource_put(devlink, skb, resource);
if (err) {
if (!i)
goto err_resource_put;
incomplete = true;
break;
}
i++;
}
nla_nest_end(skb, resources_attr);
genlmsg_end(skb, hdr);
if (incomplete)
goto start_again;
send_done:
nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
NLMSG_DONE, 0, flags | NLM_F_MULTI);
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
return err;
goto send_done;
}
return genlmsg_reply(skb, info);
nla_put_failure:
err = -EMSGSIZE;
err_resource_put:
nlmsg_free(skb);
return err;
}
static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
if (list_empty(&devlink->resource_list))
return -EOPNOTSUPP;
return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
}
int devlink_resources_validate(struct devlink *devlink,
struct devlink_resource *resource,
struct genl_info *info)
{
struct list_head *resource_list;
int err = 0;
if (resource)
resource_list = &resource->resource_list;
else
resource_list = &devlink->resource_list;
list_for_each_entry(resource, resource_list, list) {
if (!resource->size_valid)
return -EINVAL;
err = devlink_resources_validate(devlink, resource, info);
if (err)
return err;
}
return err;
}
static const struct devlink_param devlink_param_generic[] = {
{
.id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
.name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME,
.type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
.name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
.type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
.name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
.name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
.type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
.name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
.type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
.name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
.type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
.name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
.type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY,
.name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME,
.type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE,
.name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME,
.type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
.name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
.name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
.name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
.name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
.name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
.name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME,
.type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
.name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME,
.type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE,
},
{
.id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
.name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME,
.type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE,
},
};
static int devlink_param_generic_verify(const struct devlink_param *param)
{
/* verify it match generic parameter by id and name */
if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
return -EINVAL;
if (strcmp(param->name, devlink_param_generic[param->id].name))
return -ENOENT;
WARN_ON(param->type != devlink_param_generic[param->id].type);
return 0;
}
static int devlink_param_driver_verify(const struct devlink_param *param)
{
int i;
if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
return -EINVAL;
/* verify no such name in generic params */
for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
if (!strcmp(param->name, devlink_param_generic[i].name))
return -EEXIST;
return 0;
}
static struct devlink_param_item *
devlink_param_find_by_name(struct list_head *param_list,
const char *param_name)
{
struct devlink_param_item *param_item;
list_for_each_entry(param_item, param_list, list)
if (!strcmp(param_item->param->name, param_name))
return param_item;
return NULL;
}
static struct devlink_param_item *
devlink_param_find_by_id(struct list_head *param_list, u32 param_id)
{
struct devlink_param_item *param_item;
list_for_each_entry(param_item, param_list, list)
if (param_item->param->id == param_id)
return param_item;
return NULL;
}
static bool
devlink_param_cmode_is_supported(const struct devlink_param *param,
enum devlink_param_cmode cmode)
{
return test_bit(cmode, &param->supported_cmodes);
}
static int devlink_param_get(struct devlink *devlink,
const struct devlink_param *param,
struct devlink_param_gset_ctx *ctx)
{
devlink: Fix use-after-free after a failed reload After a failed devlink reload, devlink parameters are still registered, which means user space can set and get their values. In the case of the mlxsw "acl_region_rehash_interval" parameter, these operations will trigger a use-after-free [1]. Fix this by rejecting set and get operations while in the failed state. Return the "-EOPNOTSUPP" error code which does not abort the parameters dump, but instead causes it to skip over the problematic parameter. Another possible fix is to perform these checks in the mlxsw parameter callbacks, but other drivers might be affected by the same problem and I am not aware of scenarios where these stricter checks will cause a regression. [1] mlxsw_spectrum3 0000:00:10.0: Port 125: Failed to register netdev mlxsw_spectrum3 0000:00:10.0: Failed to create ports ================================================================== BUG: KASAN: use-after-free in mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 Read of size 4 at addr ffff8880099dcfd8 by task kworker/u4:4/777 CPU: 1 PID: 777 Comm: kworker/u4:4 Not tainted 5.19.0-rc7-custom-126601-gfe26f28c586d #1 Hardware name: QEMU MSN4700, BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x92/0xbd lib/dump_stack.c:106 print_address_description mm/kasan/report.c:313 [inline] print_report.cold+0x5e/0x5cf mm/kasan/report.c:429 kasan_report+0xb9/0xf0 mm/kasan/report.c:491 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report_generic.c:306 mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 mlxsw_sp_acl_region_rehash_intrvl_get+0x49/0x60 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c:1106 mlxsw_sp_params_acl_region_rehash_intrvl_get+0x33/0x80 drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3854 devlink_param_get net/core/devlink.c:4981 [inline] devlink_nl_param_fill+0x238/0x12d0 net/core/devlink.c:5089 devlink_param_notify+0xe5/0x230 net/core/devlink.c:5168 devlink_ns_change_notify net/core/devlink.c:4417 [inline] devlink_ns_change_notify net/core/devlink.c:4396 [inline] devlink_reload+0x15f/0x700 net/core/devlink.c:4507 devlink_pernet_pre_exit+0x112/0x1d0 net/core/devlink.c:12272 ops_pre_exit_list net/core/net_namespace.c:152 [inline] cleanup_net+0x494/0xc00 net/core/net_namespace.c:582 process_one_work+0x9fc/0x1710 kernel/workqueue.c:2289 worker_thread+0x675/0x10b0 kernel/workqueue.c:2436 kthread+0x30c/0x3d0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 </TASK> The buggy address belongs to the physical page: page:ffffea0000267700 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x99dc flags: 0x100000000000000(node=0|zone=1) raw: 0100000000000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880099dce80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dcf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff8880099dcf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff8880099dd000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dd080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Fixes: 98bbf70c1c41 ("mlxsw: spectrum: add "acl_region_rehash_interval" devlink param") Signed-off-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-08-09 14:35:06 +03:00
if (!param->get || devlink->reload_failed)
return -EOPNOTSUPP;
return param->get(devlink, param->id, ctx);
}
static int devlink_param_set(struct devlink *devlink,
const struct devlink_param *param,
struct devlink_param_gset_ctx *ctx)
{
devlink: Fix use-after-free after a failed reload After a failed devlink reload, devlink parameters are still registered, which means user space can set and get their values. In the case of the mlxsw "acl_region_rehash_interval" parameter, these operations will trigger a use-after-free [1]. Fix this by rejecting set and get operations while in the failed state. Return the "-EOPNOTSUPP" error code which does not abort the parameters dump, but instead causes it to skip over the problematic parameter. Another possible fix is to perform these checks in the mlxsw parameter callbacks, but other drivers might be affected by the same problem and I am not aware of scenarios where these stricter checks will cause a regression. [1] mlxsw_spectrum3 0000:00:10.0: Port 125: Failed to register netdev mlxsw_spectrum3 0000:00:10.0: Failed to create ports ================================================================== BUG: KASAN: use-after-free in mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 Read of size 4 at addr ffff8880099dcfd8 by task kworker/u4:4/777 CPU: 1 PID: 777 Comm: kworker/u4:4 Not tainted 5.19.0-rc7-custom-126601-gfe26f28c586d #1 Hardware name: QEMU MSN4700, BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: netns cleanup_net Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x92/0xbd lib/dump_stack.c:106 print_address_description mm/kasan/report.c:313 [inline] print_report.cold+0x5e/0x5cf mm/kasan/report.c:429 kasan_report+0xb9/0xf0 mm/kasan/report.c:491 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report_generic.c:306 mlxsw_sp_acl_tcam_vregion_rehash_intrvl_get+0xbd/0xd0 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c:904 mlxsw_sp_acl_region_rehash_intrvl_get+0x49/0x60 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c:1106 mlxsw_sp_params_acl_region_rehash_intrvl_get+0x33/0x80 drivers/net/ethernet/mellanox/mlxsw/spectrum.c:3854 devlink_param_get net/core/devlink.c:4981 [inline] devlink_nl_param_fill+0x238/0x12d0 net/core/devlink.c:5089 devlink_param_notify+0xe5/0x230 net/core/devlink.c:5168 devlink_ns_change_notify net/core/devlink.c:4417 [inline] devlink_ns_change_notify net/core/devlink.c:4396 [inline] devlink_reload+0x15f/0x700 net/core/devlink.c:4507 devlink_pernet_pre_exit+0x112/0x1d0 net/core/devlink.c:12272 ops_pre_exit_list net/core/net_namespace.c:152 [inline] cleanup_net+0x494/0xc00 net/core/net_namespace.c:582 process_one_work+0x9fc/0x1710 kernel/workqueue.c:2289 worker_thread+0x675/0x10b0 kernel/workqueue.c:2436 kthread+0x30c/0x3d0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 </TASK> The buggy address belongs to the physical page: page:ffffea0000267700 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x99dc flags: 0x100000000000000(node=0|zone=1) raw: 0100000000000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880099dce80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dcf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff8880099dcf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff8880099dd000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff8880099dd080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Fixes: 98bbf70c1c41 ("mlxsw: spectrum: add "acl_region_rehash_interval" devlink param") Signed-off-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-08-09 14:35:06 +03:00
if (!param->set || devlink->reload_failed)
return -EOPNOTSUPP;
return param->set(devlink, param->id, ctx);
}
static int
devlink_param_type_to_nla_type(enum devlink_param_type param_type)
{
switch (param_type) {
case DEVLINK_PARAM_TYPE_U8:
return NLA_U8;
case DEVLINK_PARAM_TYPE_U16:
return NLA_U16;
case DEVLINK_PARAM_TYPE_U32:
return NLA_U32;
case DEVLINK_PARAM_TYPE_STRING:
return NLA_STRING;
case DEVLINK_PARAM_TYPE_BOOL:
return NLA_FLAG;
default:
return -EINVAL;
}
}
static int
devlink_nl_param_value_fill_one(struct sk_buff *msg,
enum devlink_param_type type,
enum devlink_param_cmode cmode,
union devlink_param_value val)
{
struct nlattr *param_value_attr;
param_value_attr = nla_nest_start_noflag(msg,
DEVLINK_ATTR_PARAM_VALUE);
if (!param_value_attr)
goto nla_put_failure;
if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
goto value_nest_cancel;
switch (type) {
case DEVLINK_PARAM_TYPE_U8:
if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8))
goto value_nest_cancel;
break;
case DEVLINK_PARAM_TYPE_U16:
if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16))
goto value_nest_cancel;
break;
case DEVLINK_PARAM_TYPE_U32:
if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32))
goto value_nest_cancel;
break;
case DEVLINK_PARAM_TYPE_STRING:
if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
val.vstr))
goto value_nest_cancel;
break;
case DEVLINK_PARAM_TYPE_BOOL:
if (val.vbool &&
nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA))
goto value_nest_cancel;
break;
}
nla_nest_end(msg, param_value_attr);
return 0;
value_nest_cancel:
nla_nest_cancel(msg, param_value_attr);
nla_put_failure:
return -EMSGSIZE;
}
static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
unsigned int port_index,
struct devlink_param_item *param_item,
enum devlink_command cmd,
u32 portid, u32 seq, int flags)
{
union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};
const struct devlink_param *param = param_item->param;
struct devlink_param_gset_ctx ctx;
struct nlattr *param_values_list;
struct nlattr *param_attr;
int nla_type;
void *hdr;
int err;
int i;
/* Get value from driver part to driverinit configuration mode */
for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
if (!devlink_param_cmode_is_supported(param, i))
continue;
if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) {
if (!param_item->driverinit_value_valid)
return -EOPNOTSUPP;
param_value[i] = param_item->driverinit_value;
} else {
ctx.cmode = i;
err = devlink_param_get(devlink, param, &ctx);
if (err)
return err;
param_value[i] = ctx.val;
}
param_value_set[i] = true;
}
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto genlmsg_cancel;
if (cmd == DEVLINK_CMD_PORT_PARAM_GET ||
cmd == DEVLINK_CMD_PORT_PARAM_NEW ||
cmd == DEVLINK_CMD_PORT_PARAM_DEL)
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index))
goto genlmsg_cancel;
param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM);
if (!param_attr)
goto genlmsg_cancel;
if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
goto param_nest_cancel;
if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
goto param_nest_cancel;
nla_type = devlink_param_type_to_nla_type(param->type);
if (nla_type < 0)
goto param_nest_cancel;
if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type))
goto param_nest_cancel;
param_values_list = nla_nest_start_noflag(msg,
DEVLINK_ATTR_PARAM_VALUES_LIST);
if (!param_values_list)
goto param_nest_cancel;
for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
if (!param_value_set[i])
continue;
err = devlink_nl_param_value_fill_one(msg, param->type,
i, param_value[i]);
if (err)
goto values_list_nest_cancel;
}
nla_nest_end(msg, param_values_list);
nla_nest_end(msg, param_attr);
genlmsg_end(msg, hdr);
return 0;
values_list_nest_cancel:
nla_nest_end(msg, param_values_list);
param_nest_cancel:
nla_nest_cancel(msg, param_attr);
genlmsg_cancel:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static void devlink_param_notify(struct devlink *devlink,
unsigned int port_index,
struct devlink_param_item *param_item,
enum devlink_command cmd)
{
struct sk_buff *msg;
int err;
WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL &&
cmd != DEVLINK_CMD_PORT_PARAM_NEW &&
cmd != DEVLINK_CMD_PORT_PARAM_DEL);
/* devlink_notify_register() / devlink_notify_unregister()
* will replay the notifications if the params are added/removed
* outside of the lifetime of the instance.
*/
if (!devl_is_registered(devlink))
return;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd,
0, 0, 0);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static int
devlink_nl_cmd_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_param_item *param_item;
int idx = 0;
int err = 0;
list_for_each_entry(param_item, &devlink->param_list, list) {
if (idx < state->idx) {
idx++;
continue;
}
err = devlink_nl_param_fill(msg, devlink, 0, param_item,
DEVLINK_CMD_PARAM_GET,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err == -EOPNOTSUPP) {
err = 0;
} else if (err) {
state->idx = idx;
break;
}
idx++;
}
return err;
}
const struct devlink_cmd devl_cmd_param_get = {
.dump_one = devlink_nl_cmd_param_get_dump_one,
};
static int
devlink_param_type_get_from_info(struct genl_info *info,
enum devlink_param_type *param_type)
{
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE))
return -EINVAL;
switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
case NLA_U8:
*param_type = DEVLINK_PARAM_TYPE_U8;
break;
case NLA_U16:
*param_type = DEVLINK_PARAM_TYPE_U16;
break;
case NLA_U32:
*param_type = DEVLINK_PARAM_TYPE_U32;
break;
case NLA_STRING:
*param_type = DEVLINK_PARAM_TYPE_STRING;
break;
case NLA_FLAG:
*param_type = DEVLINK_PARAM_TYPE_BOOL;
break;
default:
return -EINVAL;
}
return 0;
}
static int
devlink_param_value_get_from_info(const struct devlink_param *param,
struct genl_info *info,
union devlink_param_value *value)
{
struct nlattr *param_data;
int len;
param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA];
if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data)
return -EINVAL;
switch (param->type) {
case DEVLINK_PARAM_TYPE_U8:
if (nla_len(param_data) != sizeof(u8))
return -EINVAL;
value->vu8 = nla_get_u8(param_data);
break;
case DEVLINK_PARAM_TYPE_U16:
if (nla_len(param_data) != sizeof(u16))
return -EINVAL;
value->vu16 = nla_get_u16(param_data);
break;
case DEVLINK_PARAM_TYPE_U32:
if (nla_len(param_data) != sizeof(u32))
return -EINVAL;
value->vu32 = nla_get_u32(param_data);
break;
case DEVLINK_PARAM_TYPE_STRING:
len = strnlen(nla_data(param_data), nla_len(param_data));
if (len == nla_len(param_data) ||
len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
return -EINVAL;
strcpy(value->vstr, nla_data(param_data));
break;
case DEVLINK_PARAM_TYPE_BOOL:
if (param_data && nla_len(param_data))
return -EINVAL;
value->vbool = nla_get_flag(param_data);
break;
}
return 0;
}
static struct devlink_param_item *
devlink_param_get_from_info(struct list_head *param_list,
struct genl_info *info)
{
char *param_name;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME))
return NULL;
param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
return devlink_param_find_by_name(param_list, param_name);
}
static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_param_item *param_item;
struct sk_buff *msg;
int err;
param_item = devlink_param_get_from_info(&devlink->param_list, info);
if (!param_item)
return -EINVAL;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_param_fill(msg, devlink, 0, param_item,
DEVLINK_CMD_PARAM_GET,
info->snd_portid, info->snd_seq, 0);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
unsigned int port_index,
struct list_head *param_list,
struct genl_info *info,
enum devlink_command cmd)
{
enum devlink_param_type param_type;
struct devlink_param_gset_ctx ctx;
enum devlink_param_cmode cmode;
struct devlink_param_item *param_item;
const struct devlink_param *param;
union devlink_param_value value;
int err = 0;
param_item = devlink_param_get_from_info(param_list, info);
if (!param_item)
return -EINVAL;
param = param_item->param;
err = devlink_param_type_get_from_info(info, &param_type);
if (err)
return err;
if (param_type != param->type)
return -EINVAL;
err = devlink_param_value_get_from_info(param, info, &value);
if (err)
return err;
if (param->validate) {
err = param->validate(devlink, param->id, value, info->extack);
if (err)
return err;
}
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE))
return -EINVAL;
cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
if (!devlink_param_cmode_is_supported(param, cmode))
return -EOPNOTSUPP;
if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
if (param->type == DEVLINK_PARAM_TYPE_STRING)
strcpy(param_item->driverinit_value.vstr, value.vstr);
else
param_item->driverinit_value = value;
param_item->driverinit_value_valid = true;
} else {
if (!param->set)
return -EOPNOTSUPP;
ctx.val = value;
ctx.cmode = cmode;
err = devlink_param_set(devlink, param, &ctx);
if (err)
return err;
}
devlink_param_notify(devlink, port_index, param_item, cmd);
return 0;
}
static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->param_list,
info, DEVLINK_CMD_PARAM_NEW);
}
static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
struct netlink_callback *cb)
{
NL_SET_ERR_MSG_MOD(cb->extack, "Port params are not supported");
return msg->len;
}
static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported");
return -EINVAL;
}
static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported");
return -EINVAL;
}
static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
struct devlink *devlink,
struct devlink_snapshot *snapshot)
{
struct nlattr *snap_attr;
int err;
snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
if (!snap_attr)
return -EINVAL;
err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
if (err)
goto nla_put_failure;
nla_nest_end(msg, snap_attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, snap_attr);
return err;
}
static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
struct devlink *devlink,
struct devlink_region *region)
{
struct devlink_snapshot *snapshot;
struct nlattr *snapshots_attr;
int err;
snapshots_attr = nla_nest_start_noflag(msg,
DEVLINK_ATTR_REGION_SNAPSHOTS);
if (!snapshots_attr)
return -EINVAL;
list_for_each_entry(snapshot, &region->snapshot_list, list) {
err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot);
if (err)
goto nla_put_failure;
}
nla_nest_end(msg, snapshots_attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, snapshots_attr);
return err;
}
static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
u32 seq, int flags,
struct devlink_region *region)
{
void *hdr;
int err;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
err = devlink_nl_put_handle(msg, devlink);
if (err)
goto nla_put_failure;
if (region->port) {
err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
region->port->index);
if (err)
goto nla_put_failure;
}
err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
if (err)
goto nla_put_failure;
err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
region->size,
DEVLINK_ATTR_PAD);
if (err)
goto nla_put_failure;
err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS,
region->max_snapshots);
if (err)
goto nla_put_failure;
err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
if (err)
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return err;
}
static struct sk_buff *
devlink_nl_region_notify_build(struct devlink_region *region,
struct devlink_snapshot *snapshot,
enum devlink_command cmd, u32 portid, u32 seq)
{
struct devlink *devlink = region->devlink;
struct sk_buff *msg;
void *hdr;
int err;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return ERR_PTR(-ENOMEM);
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd);
if (!hdr) {
err = -EMSGSIZE;
goto out_free_msg;
}
err = devlink_nl_put_handle(msg, devlink);
if (err)
goto out_cancel_msg;
if (region->port) {
err = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
region->port->index);
if (err)
goto out_cancel_msg;
}
err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
region->ops->name);
if (err)
goto out_cancel_msg;
if (snapshot) {
err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
snapshot->id);
if (err)
goto out_cancel_msg;
} else {
err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
region->size, DEVLINK_ATTR_PAD);
if (err)
goto out_cancel_msg;
}
genlmsg_end(msg, hdr);
return msg;
out_cancel_msg:
genlmsg_cancel(msg, hdr);
out_free_msg:
nlmsg_free(msg);
return ERR_PTR(err);
}
static void devlink_nl_region_notify(struct devlink_region *region,
struct devlink_snapshot *snapshot,
enum devlink_command cmd)
{
struct devlink *devlink = region->devlink;
struct sk_buff *msg;
WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
devlink: Add missed notifications iterators The commit mentioned in Fixes line missed a couple of notifications that were registered before devlink_register() and should be delayed too. As such, the too early placed WARN_ON() check spotted it. WARNING: CPU: 1 PID: 6540 at net/core/devlink.c:5158 devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Modules linked in: CPU: 1 PID: 6540 Comm: syz-executor.0 Not tainted 5.15.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Code: 38 41 b8 c0 0c 00 00 31 d2 48 89 ee 4c 89 e7 e8 72 1a 26 00 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e e9 01 bd 41 fa e8 fc bc 41 fa <0f> 0b e9 f7 fe ff ff e8 f0 bc 41 fa 0f 0b eb da 4c 89 e7 e8 c4 18 RSP: 0018:ffffc90002d6f658 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88801f08d580 RSI: ffffffff87344e94 RDI: 0000000000000003 RBP: ffff88801ee42100 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff87344d8a R11: 0000000000000000 R12: ffff88801c1dc000 R13: 0000000000000000 R14: 000000000000002c R15: ffff88801c1dc070 FS: 0000555555e8e400(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dd7c590310 CR3: 0000000069a09000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: devlink_region_create+0x39f/0x4c0 net/core/devlink.c:10327 nsim_dev_dummy_region_init drivers/net/netdevsim/dev.c:481 [inline] nsim_dev_probe+0x5f6/0x1150 drivers/net/netdevsim/dev.c:1479 call_driver_probe drivers/base/dd.c:517 [inline] really_probe+0x245/0xcc0 drivers/base/dd.c:596 __driver_probe_device+0x338/0x4d0 drivers/base/dd.c:751 driver_probe_device+0x4c/0x1a0 drivers/base/dd.c:781 __device_attach_driver+0x20b/0x2f0 drivers/base/dd.c:898 bus_for_each_drv+0x15f/0x1e0 drivers/base/bus.c:427 __device_attach+0x228/0x4a0 drivers/base/dd.c:969 bus_probe_device+0x1e4/0x290 drivers/base/bus.c:487 device_add+0xc35/0x21b0 drivers/base/core.c:3359 nsim_bus_dev_new drivers/net/netdevsim/bus.c:435 [inline] new_device_store+0x48b/0x770 drivers/net/netdevsim/bus.c:302 bus_attr_store+0x72/0xa0 drivers/base/bus.c:122 sysfs_kf_write+0x110/0x160 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x342/0x500 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2163 [inline] new_sync_write+0x429/0x660 fs/read_write.c:507 vfs_write+0x7cf/0xae0 fs/read_write.c:594 ksys_write+0x12d/0x250 fs/read_write.c:647 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f328409d3ef Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 99 fd ff ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 cc fd ff ff 48 RSP: 002b:00007ffdc6851140 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f328409d3ef RDX: 0000000000000003 RSI: 00007ffdc6851190 RDI: 0000000000000004 RBP: 0000000000000004 R08: 0000000000000000 R09: 00007ffdc68510e0 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f3284144971 R13: 00007ffdc6851190 R14: 0000000000000000 R15: 00007ffdc6851860 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Reported-by: Eric Dumazet <eric.dumazet@gmail.com> Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Link: https://lore.kernel.org/r/2ed1159291f2a589b013914f2b60d8172fc525c1.1632925030.git.leonro@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-09-29 17:18:20 +03:00
if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
return;
msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0);
if (IS_ERR(msg))
return;
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg,
0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
/**
* __devlink_snapshot_id_increment - Increment number of snapshots using an id
* @devlink: devlink instance
* @id: the snapshot id
*
* Track when a new snapshot begins using an id. Load the count for the
* given id from the snapshot xarray, increment it, and store it back.
*
* Called when a new snapshot is created with the given id.
*
* The id *must* have been previously allocated by
* devlink_region_snapshot_id_get().
*
* Returns 0 on success, or an error on failure.
*/
static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id)
{
unsigned long count;
void *p;
int err;
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
xa_lock(&devlink->snapshot_ids);
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
p = xa_load(&devlink->snapshot_ids, id);
if (WARN_ON(!p)) {
err = -EINVAL;
goto unlock;
}
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
if (WARN_ON(!xa_is_value(p))) {
err = -EINVAL;
goto unlock;
}
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
count = xa_to_value(p);
count++;
err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
GFP_ATOMIC));
unlock:
xa_unlock(&devlink->snapshot_ids);
return err;
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
}
/**
* __devlink_snapshot_id_decrement - Decrease number of snapshots using an id
* @devlink: devlink instance
* @id: the snapshot id
*
* Track when a snapshot is deleted and stops using an id. Load the count
* for the given id from the snapshot xarray, decrement it, and store it
* back.
*
* If the count reaches zero, erase this id from the xarray, freeing it
* up for future re-use by devlink_region_snapshot_id_get().
*
* Called when a snapshot using the given id is deleted, and when the
* initial allocator of the id is finished using it.
*/
static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id)
{
unsigned long count;
void *p;
xa_lock(&devlink->snapshot_ids);
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
p = xa_load(&devlink->snapshot_ids, id);
if (WARN_ON(!p))
goto unlock;
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
if (WARN_ON(!xa_is_value(p)))
goto unlock;
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
count = xa_to_value(p);
if (count > 1) {
count--;
__xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
GFP_ATOMIC);
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
} else {
/* If this was the last user, we can erase this id */
__xa_erase(&devlink->snapshot_ids, id);
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
}
unlock:
xa_unlock(&devlink->snapshot_ids);
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
}
/**
* __devlink_snapshot_id_insert - Insert a specific snapshot ID
* @devlink: devlink instance
* @id: the snapshot id
*
* Mark the given snapshot id as used by inserting a zero value into the
* snapshot xarray.
*
* This must be called while holding the devlink instance lock. Unlike
* devlink_snapshot_id_get, the initial reference count is zero, not one.
* It is expected that the id will immediately be used before
* releasing the devlink instance lock.
*
* Returns zero on success, or an error code if the snapshot id could not
* be inserted.
*/
static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id)
{
int err;
xa_lock(&devlink->snapshot_ids);
if (xa_load(&devlink->snapshot_ids, id)) {
xa_unlock(&devlink->snapshot_ids);
return -EEXIST;
}
err = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(0),
GFP_ATOMIC));
xa_unlock(&devlink->snapshot_ids);
return err;
}
/**
* __devlink_region_snapshot_id_get - get snapshot ID
* @devlink: devlink instance
* @id: storage to return snapshot id
*
* Allocates a new snapshot id. Returns zero on success, or a negative
* error on failure. Must be called while holding the devlink instance
* lock.
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
*
* Snapshot IDs are tracked using an xarray which stores the number of
* users of the snapshot id.
*
* Note that the caller of this function counts as a 'user', in order to
* avoid race conditions. The caller must release its hold on the
* snapshot by using devlink_region_snapshot_id_put.
*/
static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
{
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
xa_limit_32b, GFP_KERNEL);
}
/**
* __devlink_region_snapshot_create - create a new snapshot
* This will add a new snapshot of a region. The snapshot
* will be stored on the region struct and can be accessed
* from devlink. This is useful for future analyses of snapshots.
* Multiple snapshots can be created on a region.
* The @snapshot_id should be obtained using the getter function.
*
* Must be called only while holding the region snapshot lock.
*
* @region: devlink region of the snapshot
* @data: snapshot data
* @snapshot_id: snapshot id to be created
*/
static int
__devlink_region_snapshot_create(struct devlink_region *region,
u8 *data, u32 snapshot_id)
{
struct devlink *devlink = region->devlink;
struct devlink_snapshot *snapshot;
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
int err;
lockdep_assert_held(&region->snapshot_lock);
/* check if region can hold one more snapshot */
if (region->cur_snapshots == region->max_snapshots)
return -ENOSPC;
if (devlink_region_snapshot_get_by_id(region, snapshot_id))
return -EEXIST;
snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
if (!snapshot)
return -ENOMEM;
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
err = __devlink_snapshot_id_increment(devlink, snapshot_id);
if (err)
goto err_snapshot_id_increment;
snapshot->id = snapshot_id;
snapshot->region = region;
snapshot->data = data;
list_add_tail(&snapshot->list, &region->snapshot_list);
region->cur_snapshots++;
devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
return 0;
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
err_snapshot_id_increment:
kfree(snapshot);
return err;
}
static void devlink_region_snapshot_del(struct devlink_region *region,
struct devlink_snapshot *snapshot)
{
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
struct devlink *devlink = region->devlink;
lockdep_assert_held(&region->snapshot_lock);
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
region->cur_snapshots--;
list_del(&snapshot->list);
region->ops->destructor(snapshot->data);
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
__devlink_snapshot_id_decrement(devlink, snapshot->id);
kfree(snapshot);
}
static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_port *port = NULL;
struct devlink_region *region;
const char *region_name;
struct sk_buff *msg;
unsigned int index;
int err;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME))
return -EINVAL;
if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
port = devlink_port_get_by_index(devlink, index);
if (!port)
return -ENODEV;
}
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
if (port)
region = devlink_port_region_get_by_name(port, region_name);
else
region = devlink_region_get_by_name(devlink, region_name);
if (!region)
return -EINVAL;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_region_fill(msg, devlink, DEVLINK_CMD_REGION_GET,
info->snd_portid, info->snd_seq, 0,
region);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg,
struct netlink_callback *cb,
struct devlink_port *port,
int *idx,
int start)
{
struct devlink_region *region;
int err = 0;
list_for_each_entry(region, &port->region_list, list) {
if (*idx < start) {
(*idx)++;
continue;
}
err = devlink_nl_region_fill(msg, port->devlink,
DEVLINK_CMD_REGION_GET,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI, region);
if (err)
goto out;
(*idx)++;
}
out:
return err;
}
static int
devlink_nl_cmd_region_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_region *region;
struct devlink_port *port;
unsigned long port_index;
int idx = 0;
int err;
list_for_each_entry(region, &devlink->region_list, list) {
if (idx < state->idx) {
idx++;
continue;
}
err = devlink_nl_region_fill(msg, devlink,
DEVLINK_CMD_REGION_GET,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI, region);
if (err) {
state->idx = idx;
return err;
}
idx++;
}
xa_for_each(&devlink->ports, port_index, port) {
err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx,
state->idx);
if (err) {
state->idx = idx;
return err;
}
}
return 0;
}
const struct devlink_cmd devl_cmd_region_get = {
.dump_one = devlink_nl_cmd_region_get_dump_one,
};
static int devlink_nl_cmd_region_del(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_snapshot *snapshot;
struct devlink_port *port = NULL;
struct devlink_region *region;
const char *region_name;
unsigned int index;
u32 snapshot_id;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) ||
GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID))
return -EINVAL;
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
port = devlink_port_get_by_index(devlink, index);
if (!port)
return -ENODEV;
}
if (port)
region = devlink_port_region_get_by_name(port, region_name);
else
region = devlink_region_get_by_name(devlink, region_name);
if (!region)
return -EINVAL;
mutex_lock(&region->snapshot_lock);
snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
if (!snapshot) {
mutex_unlock(&region->snapshot_lock);
return -EINVAL;
}
devlink_region_snapshot_del(region, snapshot);
mutex_unlock(&region->snapshot_lock);
return 0;
}
static int
devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_snapshot *snapshot;
struct devlink_port *port = NULL;
struct nlattr *snapshot_id_attr;
struct devlink_region *region;
const char *region_name;
unsigned int index;
u32 snapshot_id;
u8 *data;
int err;
if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) {
NL_SET_ERR_MSG_MOD(info->extack, "No region name provided");
return -EINVAL;
}
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
port = devlink_port_get_by_index(devlink, index);
if (!port)
return -ENODEV;
}
if (port)
region = devlink_port_region_get_by_name(port, region_name);
else
region = devlink_region_get_by_name(devlink, region_name);
if (!region) {
NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist");
return -EINVAL;
}
if (!region->ops->snapshot) {
NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot");
return -EOPNOTSUPP;
}
mutex_lock(&region->snapshot_lock);
if (region->cur_snapshots == region->max_snapshots) {
NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots");
err = -ENOSPC;
goto unlock;
}
snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
if (snapshot_id_attr) {
snapshot_id = nla_get_u32(snapshot_id_attr);
if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use");
err = -EEXIST;
goto unlock;
}
err = __devlink_snapshot_id_insert(devlink, snapshot_id);
if (err)
goto unlock;
} else {
err = __devlink_region_snapshot_id_get(devlink, &snapshot_id);
if (err) {
NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id");
goto unlock;
}
}
if (port)
err = region->port_ops->snapshot(port, region->port_ops,
info->extack, &data);
else
err = region->ops->snapshot(devlink, region->ops,
info->extack, &data);
if (err)
goto err_snapshot_capture;
err = __devlink_region_snapshot_create(region, data, snapshot_id);
if (err)
goto err_snapshot_create;
if (!snapshot_id_attr) {
struct sk_buff *msg;
snapshot = devlink_region_snapshot_get_by_id(region,
snapshot_id);
if (WARN_ON(!snapshot)) {
err = -EINVAL;
goto unlock;
}
msg = devlink_nl_region_notify_build(region, snapshot,
DEVLINK_CMD_REGION_NEW,
info->snd_portid,
info->snd_seq);
err = PTR_ERR_OR_ZERO(msg);
if (err)
goto err_notify;
err = genlmsg_reply(msg, info);
if (err)
goto err_notify;
}
mutex_unlock(&region->snapshot_lock);
return 0;
err_snapshot_create:
region->ops->destructor(data);
err_snapshot_capture:
__devlink_snapshot_id_decrement(devlink, snapshot_id);
mutex_unlock(&region->snapshot_lock);
return err;
err_notify:
devlink_region_snapshot_del(region, snapshot);
unlock:
mutex_unlock(&region->snapshot_lock);
return err;
}
static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
u8 *chunk, u32 chunk_size,
u64 addr)
{
struct nlattr *chunk_attr;
int err;
chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK);
if (!chunk_attr)
return -EINVAL;
err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
if (err)
goto nla_put_failure;
err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr,
DEVLINK_ATTR_PAD);
if (err)
goto nla_put_failure;
nla_nest_end(msg, chunk_attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, chunk_attr);
return err;
}
#define DEVLINK_REGION_READ_CHUNK_SIZE 256
typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size,
u64 curr_offset,
struct netlink_ext_ack *extack);
static int
devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb,
void *cb_priv, u64 start_offset, u64 end_offset,
u64 *new_offset, struct netlink_ext_ack *extack)
{
u64 curr_offset = start_offset;
int err = 0;
u8 *data;
/* Allocate and re-use a single buffer */
data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL);
if (!data)
return -ENOMEM;
*new_offset = start_offset;
while (curr_offset < end_offset) {
u32 data_size;
data_size = min_t(u32, end_offset - curr_offset,
DEVLINK_REGION_READ_CHUNK_SIZE);
err = cb(cb_priv, data, data_size, curr_offset, extack);
if (err)
break;
err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset);
if (err)
break;
curr_offset += data_size;
}
*new_offset = curr_offset;
kfree(data);
return err;
}
static int
devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
u64 curr_offset,
struct netlink_ext_ack __always_unused *extack)
{
struct devlink_snapshot *snapshot = cb_priv;
memcpy(chunk, &snapshot->data[curr_offset], chunk_size);
return 0;
}
static int
devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
u64 curr_offset, struct netlink_ext_ack *extack)
{
struct devlink_region *region = cb_priv;
return region->port_ops->read(region->port, region->port_ops, extack,
curr_offset, chunk_size, chunk);
}
static int
devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
u64 curr_offset, struct netlink_ext_ack *extack)
{
struct devlink_region *region = cb_priv;
return region->ops->read(region->devlink, region->ops, extack,
curr_offset, chunk_size, chunk);
}
static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
struct netlink_callback *cb)
{
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct nlattr *chunks_attr, *region_attr, *snapshot_attr;
u64 ret_offset, start_offset, end_offset = U64_MAX;
struct nlattr **attrs = info->attrs;
struct devlink_port *port = NULL;
devlink_chunk_fill_t *region_cb;
struct devlink_region *region;
const char *region_name;
struct devlink *devlink;
unsigned int index;
void *region_cb_priv;
void *hdr;
int err;
start_offset = state->start_offset;
devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs);
if (IS_ERR(devlink))
return PTR_ERR(devlink);
if (!attrs[DEVLINK_ATTR_REGION_NAME]) {
NL_SET_ERR_MSG(cb->extack, "No region name provided");
err = -EINVAL;
goto out_unlock;
}
if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
port = devlink_port_get_by_index(devlink, index);
if (!port) {
err = -ENODEV;
goto out_unlock;
}
}
region_attr = attrs[DEVLINK_ATTR_REGION_NAME];
region_name = nla_data(region_attr);
if (port)
region = devlink_port_region_get_by_name(port, region_name);
else
region = devlink_region_get_by_name(devlink, region_name);
if (!region) {
NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist");
err = -EINVAL;
goto out_unlock;
}
snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
if (!snapshot_attr) {
if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) {
NL_SET_ERR_MSG(cb->extack, "No snapshot id provided");
err = -EINVAL;
goto out_unlock;
}
if (!region->ops->read) {
NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read");
err = -EOPNOTSUPP;
goto out_unlock;
}
if (port)
region_cb = &devlink_region_port_direct_fill;
else
region_cb = &devlink_region_direct_fill;
region_cb_priv = region;
} else {
struct devlink_snapshot *snapshot;
u32 snapshot_id;
if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) {
NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot");
err = -EINVAL;
goto out_unlock;
}
snapshot_id = nla_get_u32(snapshot_attr);
snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
if (!snapshot) {
NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist");
err = -EINVAL;
goto out_unlock;
}
region_cb = &devlink_region_snapshot_fill;
region_cb_priv = snapshot;
}
if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
if (!start_offset)
start_offset =
nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
}
if (end_offset > region->size)
end_offset = region->size;
/* return 0 if there is no further data to read */
if (start_offset == end_offset) {
err = 0;
goto out_unlock;
}
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
DEVLINK_CMD_REGION_READ);
if (!hdr) {
err = -EMSGSIZE;
goto out_unlock;
}
err = devlink_nl_put_handle(skb, devlink);
if (err)
goto nla_put_failure;
if (region->port) {
err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX,
region->port->index);
if (err)
goto nla_put_failure;
}
err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
if (err)
goto nla_put_failure;
chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS);
if (!chunks_attr) {
err = -EMSGSIZE;
goto nla_put_failure;
}
err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv,
start_offset, end_offset, &ret_offset,
cb->extack);
if (err && err != -EMSGSIZE)
goto nla_put_failure;
/* Check if there was any progress done to prevent infinite loop */
if (ret_offset == start_offset) {
err = -EINVAL;
goto nla_put_failure;
}
state->start_offset = ret_offset;
nla_nest_end(skb, chunks_attr);
genlmsg_end(skb, hdr);
devl_unlock(devlink);
devlink_put(devlink);
return skb->len;
nla_put_failure:
genlmsg_cancel(skb, hdr);
out_unlock:
devl_unlock(devlink);
devlink_put(devlink);
return err;
}
struct devlink_fmsg_item {
struct list_head list;
int attrtype;
u8 nla_type;
u16 len;
int value[];
};
struct devlink_fmsg {
struct list_head item_list;
bool putting_binary; /* This flag forces enclosing of binary data
* in an array brackets. It forces using
* of designated API:
* devlink_fmsg_binary_pair_nest_start()
* devlink_fmsg_binary_pair_nest_end()
*/
};
static struct devlink_fmsg *devlink_fmsg_alloc(void)
{
struct devlink_fmsg *fmsg;
fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL);
if (!fmsg)
return NULL;
INIT_LIST_HEAD(&fmsg->item_list);
return fmsg;
}
static void devlink_fmsg_free(struct devlink_fmsg *fmsg)
{
struct devlink_fmsg_item *item, *tmp;
list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) {
list_del(&item->list);
kfree(item);
}
kfree(fmsg);
}
static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg,
int attrtype)
{
struct devlink_fmsg_item *item;
item = kzalloc(sizeof(*item), GFP_KERNEL);
if (!item)
return -ENOMEM;
item->attrtype = attrtype;
list_add_tail(&item->list, &fmsg->item_list);
return 0;
}
int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);
static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
}
int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end);
#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN)
static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
{
struct devlink_fmsg_item *item;
if (fmsg->putting_binary)
return -EINVAL;
if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE)
return -EMSGSIZE;
item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL);
if (!item)
return -ENOMEM;
item->nla_type = NLA_NUL_STRING;
item->len = strlen(name) + 1;
item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME;
memcpy(&item->value, name, item->len);
list_add_tail(&item->list, &fmsg->item_list);
return 0;
}
int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
{
int err;
if (fmsg->putting_binary)
return -EINVAL;
err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START);
if (err)
return err;
err = devlink_fmsg_put_name(fmsg, name);
if (err)
return err;
return 0;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start);
int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end);
int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
const char *name)
{
int err;
if (fmsg->putting_binary)
return -EINVAL;
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START);
if (err)
return err;
return 0;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start);
int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
{
int err;
if (fmsg->putting_binary)
return -EINVAL;
err = devlink_fmsg_nest_end(fmsg);
if (err)
return err;
err = devlink_fmsg_nest_end(fmsg);
if (err)
return err;
return 0;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end);
int devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
const char *name)
{
int err;
err = devlink_fmsg_arr_pair_nest_start(fmsg, name);
if (err)
return err;
fmsg->putting_binary = true;
return err;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start);
int devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg)
{
if (!fmsg->putting_binary)
return -EINVAL;
fmsg->putting_binary = false;
return devlink_fmsg_arr_pair_nest_end(fmsg);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end);
static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
const void *value, u16 value_len,
u8 value_nla_type)
{
struct devlink_fmsg_item *item;
if (value_len > DEVLINK_FMSG_MAX_SIZE)
return -EMSGSIZE;
item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL);
if (!item)
return -ENOMEM;
item->nla_type = value_nla_type;
item->len = value_len;
item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
memcpy(&item->value, value, item->len);
list_add_tail(&item->list, &fmsg->item_list);
return 0;
}
static int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG);
}
static int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8);
}
int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
static int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64);
}
int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
{
if (fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1,
NLA_NUL_STRING);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
u16 value_len)
{
if (!fmsg->putting_binary)
return -EINVAL;
return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY);
}
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
bool value)
{
int err;
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
err = devlink_fmsg_bool_put(fmsg, value);
if (err)
return err;
err = devlink_fmsg_pair_nest_end(fmsg);
if (err)
return err;
return 0;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put);
int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
u8 value)
{
int err;
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
err = devlink_fmsg_u8_put(fmsg, value);
if (err)
return err;
err = devlink_fmsg_pair_nest_end(fmsg);
if (err)
return err;
return 0;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put);
int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
u32 value)
{
int err;
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
err = devlink_fmsg_u32_put(fmsg, value);
if (err)
return err;
err = devlink_fmsg_pair_nest_end(fmsg);
if (err)
return err;
return 0;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put);
int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
u64 value)
{
int err;
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
err = devlink_fmsg_u64_put(fmsg, value);
if (err)
return err;
err = devlink_fmsg_pair_nest_end(fmsg);
if (err)
return err;
return 0;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put);
int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
const char *value)
{
int err;
err = devlink_fmsg_pair_nest_start(fmsg, name);
if (err)
return err;
err = devlink_fmsg_string_put(fmsg, value);
if (err)
return err;
err = devlink_fmsg_pair_nest_end(fmsg);
if (err)
return err;
return 0;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put);
int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
const void *value, u32 value_len)
{
u32 data_size;
int end_err;
u32 offset;
int err;
err = devlink_fmsg_binary_pair_nest_start(fmsg, name);
if (err)
return err;
for (offset = 0; offset < value_len; offset += data_size) {
data_size = value_len - offset;
if (data_size > DEVLINK_FMSG_MAX_SIZE)
data_size = DEVLINK_FMSG_MAX_SIZE;
err = devlink_fmsg_binary_put(fmsg, value + offset, data_size);
if (err)
break;
/* Exit from loop with a break (instead of
* return) to make sure putting_binary is turned off in
* devlink_fmsg_binary_pair_nest_end
*/
}
end_err = devlink_fmsg_binary_pair_nest_end(fmsg);
if (end_err)
err = end_err;
return err;
}
EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
static int
devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb)
{
switch (msg->nla_type) {
case NLA_FLAG:
case NLA_U8:
case NLA_U32:
case NLA_U64:
case NLA_NUL_STRING:
case NLA_BINARY:
return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE,
msg->nla_type);
default:
return -EINVAL;
}
}
static int
devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb)
{
int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
u8 tmp;
switch (msg->nla_type) {
case NLA_FLAG:
/* Always provide flag data, regardless of its value */
tmp = *(bool *) msg->value;
return nla_put_u8(skb, attrtype, tmp);
case NLA_U8:
return nla_put_u8(skb, attrtype, *(u8 *) msg->value);
case NLA_U32:
return nla_put_u32(skb, attrtype, *(u32 *) msg->value);
case NLA_U64:
return nla_put_u64_64bit(skb, attrtype, *(u64 *) msg->value,
DEVLINK_ATTR_PAD);
case NLA_NUL_STRING:
return nla_put_string(skb, attrtype, (char *) &msg->value);
case NLA_BINARY:
return nla_put(skb, attrtype, msg->len, (void *) &msg->value);
default:
return -EINVAL;
}
}
static int
devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb,
int *start)
{
struct devlink_fmsg_item *item;
struct nlattr *fmsg_nlattr;
int err = 0;
int i = 0;
fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG);
if (!fmsg_nlattr)
return -EMSGSIZE;
list_for_each_entry(item, &fmsg->item_list, list) {
if (i < *start) {
i++;
continue;
}
switch (item->attrtype) {
case DEVLINK_ATTR_FMSG_OBJ_NEST_START:
case DEVLINK_ATTR_FMSG_PAIR_NEST_START:
case DEVLINK_ATTR_FMSG_ARR_NEST_START:
case DEVLINK_ATTR_FMSG_NEST_END:
err = nla_put_flag(skb, item->attrtype);
break;
case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA:
err = devlink_fmsg_item_fill_type(item, skb);
if (err)
break;
err = devlink_fmsg_item_fill_data(item, skb);
break;
case DEVLINK_ATTR_FMSG_OBJ_NAME:
err = nla_put_string(skb, item->attrtype,
(char *) &item->value);
break;
default:
err = -EINVAL;
break;
}
if (!err)
*start = ++i;
else
break;
}
nla_nest_end(skb, fmsg_nlattr);
return err;
}
static int devlink_fmsg_snd(struct devlink_fmsg *fmsg,
struct genl_info *info,
enum devlink_command cmd, int flags)
{
struct nlmsghdr *nlh;
struct sk_buff *skb;
bool last = false;
int index = 0;
void *hdr;
int err;
while (!last) {
int tmp_index = index;
skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!skb)
return -ENOMEM;
hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
&devlink_nl_family, flags | NLM_F_MULTI, cmd);
if (!hdr) {
err = -EMSGSIZE;
goto nla_put_failure;
}
err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
if (!err)
last = true;
else if (err != -EMSGSIZE || tmp_index == index)
goto nla_put_failure;
genlmsg_end(skb, hdr);
err = genlmsg_reply(skb, info);
if (err)
return err;
}
skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!skb)
return -ENOMEM;
nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
NLMSG_DONE, 0, flags | NLM_F_MULTI);
if (!nlh) {
err = -EMSGSIZE;
goto nla_put_failure;
}
return genlmsg_reply(skb, info);
nla_put_failure:
nlmsg_free(skb);
return err;
}
static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb,
struct netlink_callback *cb,
enum devlink_command cmd)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
int index = state->idx;
int tmp_index = index;
void *hdr;
int err;
hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd);
if (!hdr) {
err = -EMSGSIZE;
goto nla_put_failure;
}
err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
if ((err && err != -EMSGSIZE) || tmp_index == index)
goto nla_put_failure;
state->idx = index;
genlmsg_end(skb, hdr);
return skb->len;
nla_put_failure:
genlmsg_cancel(skb, hdr);
return err;
}
struct devlink_health_reporter {
struct list_head list;
void *priv;
const struct devlink_health_reporter_ops *ops;
struct devlink *devlink;
struct devlink_port *devlink_port;
struct devlink_fmsg *dump_fmsg;
struct mutex dump_lock; /* lock parallel read/write from dump buffers */
u64 graceful_period;
bool auto_recover;
bool auto_dump;
u8 health_state;
u64 dump_ts;
u64 dump_real_ts;
u64 error_count;
u64 recovery_count;
u64 last_recovery_ts;
};
void *
devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
{
return reporter->priv;
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_priv);
static struct devlink_health_reporter *
__devlink_health_reporter_find_by_name(struct list_head *reporter_list,
const char *reporter_name)
{
struct devlink_health_reporter *reporter;
list_for_each_entry(reporter, reporter_list, list)
if (!strcmp(reporter->ops->name, reporter_name))
return reporter;
return NULL;
}
static struct devlink_health_reporter *
devlink_health_reporter_find_by_name(struct devlink *devlink,
const char *reporter_name)
{
return __devlink_health_reporter_find_by_name(&devlink->reporter_list,
reporter_name);
}
static struct devlink_health_reporter *
devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port,
const char *reporter_name)
{
return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list,
reporter_name);
}
static struct devlink_health_reporter *
__devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
if (WARN_ON(graceful_period && !ops->recover))
return ERR_PTR(-EINVAL);
reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
if (!reporter)
return ERR_PTR(-ENOMEM);
reporter->priv = priv;
reporter->ops = ops;
reporter->devlink = devlink;
reporter->graceful_period = graceful_period;
reporter->auto_recover = !!ops->recover;
reporter->auto_dump = !!ops->dump;
mutex_init(&reporter->dump_lock);
return reporter;
}
/**
* devl_port_health_reporter_create - create devlink health reporter for
* specified port instance
*
* @port: devlink_port which should contain the new reporter
* @ops: ops
* @graceful_period: to avoid recovery loops, in msecs
* @priv: priv
*/
struct devlink_health_reporter *
devl_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
devl_assert_locked(port->devlink);
if (__devlink_health_reporter_find_by_name(&port->reporter_list,
ops->name))
return ERR_PTR(-EEXIST);
reporter = __devlink_health_reporter_create(port->devlink, ops,
graceful_period, priv);
if (IS_ERR(reporter))
return reporter;
reporter->devlink_port = port;
list_add_tail(&reporter->list, &port->reporter_list);
return reporter;
}
EXPORT_SYMBOL_GPL(devl_port_health_reporter_create);
struct devlink_health_reporter *
devlink_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
struct devlink *devlink = port->devlink;
devl_lock(devlink);
reporter = devl_port_health_reporter_create(port, ops,
graceful_period, priv);
devl_unlock(devlink);
return reporter;
}
EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create);
/**
* devl_health_reporter_create - create devlink health reporter
*
* @devlink: devlink
* @ops: ops
* @graceful_period: to avoid recovery loops, in msecs
* @priv: priv
*/
struct devlink_health_reporter *
devl_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
devl_assert_locked(devlink);
if (devlink_health_reporter_find_by_name(devlink, ops->name))
return ERR_PTR(-EEXIST);
reporter = __devlink_health_reporter_create(devlink, ops,
graceful_period, priv);
if (IS_ERR(reporter))
return reporter;
list_add_tail(&reporter->list, &devlink->reporter_list);
return reporter;
}
EXPORT_SYMBOL_GPL(devl_health_reporter_create);
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
devl_lock(devlink);
reporter = devl_health_reporter_create(devlink, ops,
graceful_period, priv);
devl_unlock(devlink);
return reporter;
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_create);
static void
devlink_health_reporter_free(struct devlink_health_reporter *reporter)
{
mutex_destroy(&reporter->dump_lock);
if (reporter->dump_fmsg)
devlink_fmsg_free(reporter->dump_fmsg);
kfree(reporter);
}
/**
* devl_health_reporter_destroy - destroy devlink health reporter
*
* @reporter: devlink health reporter to destroy
*/
void
devl_health_reporter_destroy(struct devlink_health_reporter *reporter)
{
devl_assert_locked(reporter->devlink);
list_del(&reporter->list);
devlink_health_reporter_free(reporter);
}
EXPORT_SYMBOL_GPL(devl_health_reporter_destroy);
void
devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
{
struct devlink *devlink = reporter->devlink;
devl_lock(devlink);
devl_health_reporter_destroy(reporter);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
static int
devlink_nl_health_reporter_fill(struct sk_buff *msg,
struct devlink_health_reporter *reporter,
enum devlink_command cmd, u32 portid,
u32 seq, int flags)
{
struct devlink *devlink = reporter->devlink;
struct nlattr *reporter_attr;
void *hdr;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto genlmsg_cancel;
if (reporter->devlink_port) {
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index))
goto genlmsg_cancel;
}
reporter_attr = nla_nest_start_noflag(msg,
DEVLINK_ATTR_HEALTH_REPORTER);
if (!reporter_attr)
goto genlmsg_cancel;
if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
reporter->ops->name))
goto reporter_nest_cancel;
if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
reporter->health_state))
goto reporter_nest_cancel;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT,
reporter->error_count, DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT,
reporter->recovery_count, DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
if (reporter->ops->recover &&
nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
reporter->graceful_period,
DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
if (reporter->ops->recover &&
nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
reporter->auto_recover))
goto reporter_nest_cancel;
if (reporter->dump_fmsg &&
nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
jiffies_to_msecs(reporter->dump_ts),
DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
if (reporter->dump_fmsg &&
nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
reporter->dump_real_ts, DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
if (reporter->ops->dump &&
nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
reporter->auto_dump))
goto reporter_nest_cancel;
nla_nest_end(msg, reporter_attr);
genlmsg_end(msg, hdr);
return 0;
reporter_nest_cancel:
nla_nest_end(msg, reporter_attr);
genlmsg_cancel:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static void devlink_recover_notify(struct devlink_health_reporter *reporter,
enum devlink_command cmd)
{
struct devlink *devlink = reporter->devlink;
struct sk_buff *msg;
int err;
WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED));
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg,
0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
void
devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter)
{
reporter->recovery_count++;
reporter->last_recovery_ts = jiffies;
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done);
static int
devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
void *priv_ctx, struct netlink_ext_ack *extack)
{
int err;
if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
return 0;
if (!reporter->ops->recover)
return -EOPNOTSUPP;
err = reporter->ops->recover(reporter, priv_ctx, extack);
if (err)
return err;
devlink_health_reporter_recovery_done(reporter);
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
return 0;
}
static void
devlink_health_dump_clear(struct devlink_health_reporter *reporter)
{
if (!reporter->dump_fmsg)
return;
devlink_fmsg_free(reporter->dump_fmsg);
reporter->dump_fmsg = NULL;
}
static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
void *priv_ctx,
struct netlink_ext_ack *extack)
{
int err;
if (!reporter->ops->dump)
return 0;
if (reporter->dump_fmsg)
return 0;
reporter->dump_fmsg = devlink_fmsg_alloc();
if (!reporter->dump_fmsg) {
err = -ENOMEM;
return err;
}
err = devlink_fmsg_obj_nest_start(reporter->dump_fmsg);
if (err)
goto dump_err;
err = reporter->ops->dump(reporter, reporter->dump_fmsg,
priv_ctx, extack);
if (err)
goto dump_err;
err = devlink_fmsg_obj_nest_end(reporter->dump_fmsg);
if (err)
goto dump_err;
reporter->dump_ts = jiffies;
reporter->dump_real_ts = ktime_get_real_ns();
return 0;
dump_err:
devlink_health_dump_clear(reporter);
return err;
}
int devlink_health_report(struct devlink_health_reporter *reporter,
const char *msg, void *priv_ctx)
{
enum devlink_health_reporter_state prev_health_state;
struct devlink *devlink = reporter->devlink;
unsigned long recover_ts_threshold;
int ret;
/* write a log message of the current error */
WARN_ON(!msg);
trace_devlink_health_report(devlink, reporter->ops->name, msg);
reporter->error_count++;
prev_health_state = reporter->health_state;
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
/* abort if the previous error wasn't recovered */
recover_ts_threshold = reporter->last_recovery_ts +
msecs_to_jiffies(reporter->graceful_period);
if (reporter->auto_recover &&
(prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
(reporter->last_recovery_ts && reporter->recovery_count &&
time_is_after_jiffies(recover_ts_threshold)))) {
trace_devlink_health_recover_aborted(devlink,
reporter->ops->name,
reporter->health_state,
jiffies -
reporter->last_recovery_ts);
return -ECANCELED;
}
if (reporter->auto_dump) {
mutex_lock(&reporter->dump_lock);
/* store current dump of current error, for later analysis */
devlink_health_do_dump(reporter, priv_ctx, NULL);
mutex_unlock(&reporter->dump_lock);
}
if (!reporter->auto_recover)
return 0;
devl_lock(devlink);
ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL);
devl_unlock(devlink);
return ret;
}
EXPORT_SYMBOL_GPL(devlink_health_report);
static struct devlink_health_reporter *
devlink_health_reporter_get_from_attrs(struct devlink *devlink,
struct nlattr **attrs)
{
struct devlink_port *devlink_port;
char *reporter_name;
if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
return NULL;
reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
devlink_port = devlink_port_get_from_attrs(devlink, attrs);
if (IS_ERR(devlink_port))
return devlink_health_reporter_find_by_name(devlink,
reporter_name);
else
return devlink_port_health_reporter_find_by_name(devlink_port,
reporter_name);
}
static struct devlink_health_reporter *
devlink_health_reporter_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
return devlink_health_reporter_get_from_attrs(devlink, info->attrs);
}
static struct devlink_health_reporter *
devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
{
const struct genl_dumpit_info *info = genl_dumpit_info(cb);
struct devlink_health_reporter *reporter;
struct nlattr **attrs = info->attrs;
struct devlink *devlink;
devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs);
if (IS_ERR(devlink))
return NULL;
devl_unlock(devlink);
reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
devlink_put(devlink);
return reporter;
}
void
devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
enum devlink_health_reporter_state state)
{
if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY &&
state != DEVLINK_HEALTH_REPORTER_STATE_ERROR))
return;
if (reporter->health_state == state)
return;
reporter->health_state = state;
trace_devlink_health_reporter_state_update(reporter->devlink,
reporter->ops->name, state);
devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update);
static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
struct sk_buff *msg;
int err;
reporter = devlink_health_reporter_get_from_info(devlink, info);
if (!reporter)
return -EINVAL;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_health_reporter_fill(msg, reporter,
DEVLINK_CMD_HEALTH_REPORTER_GET,
info->snd_portid, info->snd_seq,
0);
if (err) {
nlmsg_free(msg);
return err;
}
return genlmsg_reply(msg, info);
}
static int
devlink_nl_cmd_health_reporter_get_dump_one(struct sk_buff *msg,
struct devlink *devlink,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_health_reporter *reporter;
struct devlink_port *port;
unsigned long port_index;
int idx = 0;
int err;
list_for_each_entry(reporter, &devlink->reporter_list, list) {
if (idx < state->idx) {
idx++;
continue;
}
err = devlink_nl_health_reporter_fill(msg, reporter,
DEVLINK_CMD_HEALTH_REPORTER_GET,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
state->idx = idx;
return err;
}
idx++;
}
xa_for_each(&devlink->ports, port_index, port) {
list_for_each_entry(reporter, &port->reporter_list, list) {
if (idx < state->idx) {
idx++;
continue;
}
err = devlink_nl_health_reporter_fill(msg, reporter,
DEVLINK_CMD_HEALTH_REPORTER_GET,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
state->idx = idx;
return err;
}
idx++;
}
}
return 0;
}
const struct devlink_cmd devl_cmd_health_reporter_get = {
.dump_one = devlink_nl_cmd_health_reporter_get_dump_one,
};
static int
devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
reporter = devlink_health_reporter_get_from_info(devlink, info);
if (!reporter)
return -EINVAL;
if (!reporter->ops->recover &&
(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
return -EOPNOTSUPP;
if (!reporter->ops->dump &&
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
return -EOPNOTSUPP;
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
reporter->graceful_period =
nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
reporter->auto_recover =
nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
reporter->auto_dump =
nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]);
return 0;
}
static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
reporter = devlink_health_reporter_get_from_info(devlink, info);
if (!reporter)
return -EINVAL;
return devlink_health_reporter_recover(reporter, NULL, info->extack);
}
static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
struct devlink_fmsg *fmsg;
int err;
reporter = devlink_health_reporter_get_from_info(devlink, info);
if (!reporter)
return -EINVAL;
if (!reporter->ops->diagnose)
return -EOPNOTSUPP;
fmsg = devlink_fmsg_alloc();
if (!fmsg)
return -ENOMEM;
err = devlink_fmsg_obj_nest_start(fmsg);
if (err)
goto out;
err = reporter->ops->diagnose(reporter, fmsg, info->extack);
if (err)
goto out;
err = devlink_fmsg_obj_nest_end(fmsg);
if (err)
goto out;
err = devlink_fmsg_snd(fmsg, info,
DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
out:
devlink_fmsg_free(fmsg);
return err;
}
static int
devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb)
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_health_reporter *reporter;
int err;
reporter = devlink_health_reporter_get_from_cb(cb);
if (!reporter)
return -EINVAL;
if (!reporter->ops->dump)
return -EOPNOTSUPP;
mutex_lock(&reporter->dump_lock);
if (!state->idx) {
err = devlink_health_do_dump(reporter, NULL, cb->extack);
if (err)
goto unlock;
state->dump_ts = reporter->dump_ts;
}
if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) {
NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry");
err = -EAGAIN;
goto unlock;
}
err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb,
DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET);
unlock:
mutex_unlock(&reporter->dump_lock);
return err;
}
static int
devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
reporter = devlink_health_reporter_get_from_info(devlink, info);
if (!reporter)
return -EINVAL;
if (!reporter->ops->dump)
return -EOPNOTSUPP;
mutex_lock(&reporter->dump_lock);
devlink_health_dump_clear(reporter);
mutex_unlock(&reporter->dump_lock);
return 0;
}
static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
reporter = devlink_health_reporter_get_from_info(devlink, info);
if (!reporter)
return -EINVAL;
if (!reporter->ops->test)
return -EOPNOTSUPP;
return reporter->ops->test(reporter, info->extack);
}
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
struct devlink_stats {
u64_stats_t rx_bytes;
u64_stats_t rx_packets;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
struct u64_stats_sync syncp;
};
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
/**
* struct devlink_trap_policer_item - Packet trap policer attributes.
* @policer: Immutable packet trap policer attributes.
* @rate: Rate in packets / sec.
* @burst: Burst size in packets.
* @list: trap_policer_list member.
*
* Describes packet trap policer attributes. Created by devlink during trap
* policer registration.
*/
struct devlink_trap_policer_item {
const struct devlink_trap_policer *policer;
u64 rate;
u64 burst;
struct list_head list;
};
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
/**
* struct devlink_trap_group_item - Packet trap group attributes.
* @group: Immutable packet trap group attributes.
* @policer_item: Associated policer item. Can be NULL.
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
* @list: trap_group_list member.
* @stats: Trap group statistics.
*
* Describes packet trap group attributes. Created by devlink during trap
* group registration.
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
*/
struct devlink_trap_group_item {
const struct devlink_trap_group *group;
struct devlink_trap_policer_item *policer_item;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
struct list_head list;
struct devlink_stats __percpu *stats;
};
/**
* struct devlink_trap_item - Packet trap attributes.
* @trap: Immutable packet trap attributes.
* @group_item: Associated group item.
* @list: trap_list member.
* @action: Trap action.
* @stats: Trap statistics.
* @priv: Driver private information.
*
* Describes both mutable and immutable packet trap attributes. Created by
* devlink during trap registration and used for all trap related operations.
*/
struct devlink_trap_item {
const struct devlink_trap *trap;
struct devlink_trap_group_item *group_item;
struct list_head list;
enum devlink_trap_action action;
struct devlink_stats __percpu *stats;
void *priv;
};
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
static struct devlink_trap_policer_item *
devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id)
{
struct devlink_trap_policer_item *policer_item;
list_for_each_entry(policer_item, &devlink->trap_policer_list, list) {
if (policer_item->policer->id == id)
return policer_item;
}
return NULL;
}
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
static struct devlink_trap_item *
devlink_trap_item_lookup(struct devlink *devlink, const char *name)
{
struct devlink_trap_item *trap_item;
list_for_each_entry(trap_item, &devlink->trap_list, list) {
if (!strcmp(trap_item->trap->name, name))
return trap_item;
}
return NULL;
}
static struct devlink_trap_item *
devlink_trap_item_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
struct nlattr *attr;
if (!info->attrs[DEVLINK_ATTR_TRAP_NAME])
return NULL;
attr = info->attrs[DEVLINK_ATTR_TRAP_NAME];
return devlink_trap_item_lookup(devlink, nla_data(attr));
}
static int
devlink_trap_action_get_from_info(struct genl_info *info,
enum devlink_trap_action *p_trap_action)
{
u8 val;
val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]);
switch (val) {
case DEVLINK_TRAP_ACTION_DROP:
case DEVLINK_TRAP_ACTION_TRAP:
case DEVLINK_TRAP_ACTION_MIRROR:
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
*p_trap_action = val;
break;
default:
return -EINVAL;
}
return 0;
}
static int devlink_trap_metadata_put(struct sk_buff *msg,
const struct devlink_trap *trap)
{
struct nlattr *attr;
attr = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA);
if (!attr)
return -EMSGSIZE;
if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) &&
nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT))
goto nla_put_failure;
if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) &&
nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE))
goto nla_put_failure;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
nla_nest_end(msg, attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, attr);
return -EMSGSIZE;
}
static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
struct devlink_stats *stats)
{
int i;
memset(stats, 0, sizeof(*stats));
for_each_possible_cpu(i) {
struct devlink_stats *cpu_stats;
u64 rx_packets, rx_bytes;
unsigned int start;
cpu_stats = per_cpu_ptr(trap_stats, i);
do {
start = u64_stats_fetch_begin(&cpu_stats->syncp);
rx_packets = u64_stats_read(&cpu_stats->rx_packets);
rx_bytes = u64_stats_read(&cpu_stats->rx_bytes);
} while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
u64_stats_add(&stats->rx_packets, rx_packets);
u64_stats_add(&stats->rx_bytes, rx_bytes);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
}
static int
devlink_trap_group_stats_put(struct sk_buff *msg,
struct devlink_stats __percpu *trap_stats)
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
{
struct devlink_stats stats;
struct nlattr *attr;
devlink_trap_stats_read(trap_stats, &stats);
attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
if (!attr)
return -EMSGSIZE;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
u64_stats_read(&stats.rx_packets),
DEVLINK_ATTR_PAD))
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES,
u64_stats_read(&stats.rx_bytes),
DEVLINK_ATTR_PAD))
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
goto nla_put_failure;
nla_nest_end(msg, attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, attr);
return -EMSGSIZE;
}
static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink,
const struct devlink_trap_item *trap_item)
{
struct devlink_stats stats;
struct nlattr *attr;
u64 drops = 0;
int err;
if (devlink->ops->trap_drop_counter_get) {
err = devlink->ops->trap_drop_counter_get(devlink,
trap_item->trap,
&drops);
if (err)
return err;
}
devlink_trap_stats_read(trap_item->stats, &stats);
attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
if (!attr)
return -EMSGSIZE;
if (devlink->ops->trap_drop_counter_get &&
nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops,
DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
u64_stats_read(&stats.rx_packets),
DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES,
u64_stats_read(&stats.rx_bytes),
DEVLINK_ATTR_PAD))
goto nla_put_failure;
nla_nest_end(msg, attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, attr);
return -EMSGSIZE;
}
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
const struct devlink_trap_item *trap_item,
enum devlink_command cmd, u32 portid, u32 seq,
int flags)
{
struct devlink_trap_group_item *group_item = trap_item->group_item;
void *hdr;
int err;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME,
group_item->group->name))
goto nla_put_failure;
if (nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name))
goto nla_put_failure;
if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type))
goto nla_put_failure;
if (trap_item->trap->generic &&
nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
goto nla_put_failure;
if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action))
goto nla_put_failure;
err = devlink_trap_metadata_put(msg, trap_item->trap);
if (err)
goto nla_put_failure;
err = devlink_trap_stats_put(msg, devlink, trap_item);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
if (err)
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static int devlink_nl_cmd_trap_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_item *trap_item;
struct sk_buff *msg;
int err;
if (list_empty(&devlink->trap_list))
return -EOPNOTSUPP;
trap_item = devlink_trap_item_get_from_info(devlink, info);
if (!trap_item) {
NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap");
return -ENOENT;
}
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_trap_fill(msg, devlink, trap_item,
DEVLINK_CMD_TRAP_NEW, info->snd_portid,
info->snd_seq, 0);
if (err)
goto err_trap_fill;
return genlmsg_reply(msg, info);
err_trap_fill:
nlmsg_free(msg);
return err;
}
static int
devlink_nl_cmd_trap_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
struct netlink_callback *cb)
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_trap_item *trap_item;
int idx = 0;
int err = 0;
list_for_each_entry(trap_item, &devlink->trap_list, list) {
if (idx < state->idx) {
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
idx++;
continue;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
err = devlink_nl_trap_fill(msg, devlink, trap_item,
DEVLINK_CMD_TRAP_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
state->idx = idx;
break;
}
idx++;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
return err;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
const struct devlink_cmd devl_cmd_trap_get = {
.dump_one = devlink_nl_cmd_trap_get_dump_one,
};
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
static int __devlink_trap_action_set(struct devlink *devlink,
struct devlink_trap_item *trap_item,
enum devlink_trap_action trap_action,
struct netlink_ext_ack *extack)
{
int err;
if (trap_item->action != trap_action &&
trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) {
NL_SET_ERR_MSG_MOD(extack, "Cannot change action of non-drop traps. Skipping");
return 0;
}
err = devlink->ops->trap_action_set(devlink, trap_item->trap,
trap_action, extack);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
if (err)
return err;
trap_item->action = trap_action;
return 0;
}
static int devlink_trap_action_set(struct devlink *devlink,
struct devlink_trap_item *trap_item,
struct genl_info *info)
{
enum devlink_trap_action trap_action;
int err;
if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION])
return 0;
err = devlink_trap_action_get_from_info(info, &trap_action);
if (err) {
NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action");
return -EINVAL;
}
return __devlink_trap_action_set(devlink, trap_item, trap_action,
info->extack);
}
static int devlink_nl_cmd_trap_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_item *trap_item;
if (list_empty(&devlink->trap_list))
return -EOPNOTSUPP;
trap_item = devlink_trap_item_get_from_info(devlink, info);
if (!trap_item) {
NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap");
return -ENOENT;
}
return devlink_trap_action_set(devlink, trap_item, info);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
static struct devlink_trap_group_item *
devlink_trap_group_item_lookup(struct devlink *devlink, const char *name)
{
struct devlink_trap_group_item *group_item;
list_for_each_entry(group_item, &devlink->trap_group_list, list) {
if (!strcmp(group_item->group->name, name))
return group_item;
}
return NULL;
}
static struct devlink_trap_group_item *
devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id)
{
struct devlink_trap_group_item *group_item;
list_for_each_entry(group_item, &devlink->trap_group_list, list) {
if (group_item->group->id == id)
return group_item;
}
return NULL;
}
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
static struct devlink_trap_group_item *
devlink_trap_group_item_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
char *name;
if (!info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME])
return NULL;
name = nla_data(info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME]);
return devlink_trap_group_item_lookup(devlink, name);
}
static int
devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
const struct devlink_trap_group_item *group_item,
enum devlink_command cmd, u32 portid, u32 seq,
int flags)
{
void *hdr;
int err;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME,
group_item->group->name))
goto nla_put_failure;
if (group_item->group->generic &&
nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
goto nla_put_failure;
if (group_item->policer_item &&
nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
group_item->policer_item->policer->id))
goto nla_put_failure;
err = devlink_trap_group_stats_put(msg, group_item->stats);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
if (err)
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static int devlink_nl_cmd_trap_group_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_group_item *group_item;
struct sk_buff *msg;
int err;
if (list_empty(&devlink->trap_group_list))
return -EOPNOTSUPP;
group_item = devlink_trap_group_item_get_from_info(devlink, info);
if (!group_item) {
NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group");
return -ENOENT;
}
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_trap_group_fill(msg, devlink, group_item,
DEVLINK_CMD_TRAP_GROUP_NEW,
info->snd_portid, info->snd_seq, 0);
if (err)
goto err_trap_group_fill;
return genlmsg_reply(msg, info);
err_trap_group_fill:
nlmsg_free(msg);
return err;
}
static int
devlink_nl_cmd_trap_group_get_dump_one(struct sk_buff *msg,
struct devlink *devlink,
struct netlink_callback *cb)
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_trap_group_item *group_item;
int idx = 0;
int err = 0;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
list_for_each_entry(group_item, &devlink->trap_group_list, list) {
if (idx < state->idx) {
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
idx++;
continue;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
err = devlink_nl_trap_group_fill(msg, devlink, group_item,
DEVLINK_CMD_TRAP_GROUP_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
state->idx = idx;
break;
}
idx++;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
return err;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
const struct devlink_cmd devl_cmd_trap_group_get = {
.dump_one = devlink_nl_cmd_trap_group_get_dump_one,
};
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
static int
__devlink_trap_group_action_set(struct devlink *devlink,
struct devlink_trap_group_item *group_item,
enum devlink_trap_action trap_action,
struct netlink_ext_ack *extack)
{
const char *group_name = group_item->group->name;
struct devlink_trap_item *trap_item;
int err;
if (devlink->ops->trap_group_action_set) {
err = devlink->ops->trap_group_action_set(devlink, group_item->group,
trap_action, extack);
if (err)
return err;
list_for_each_entry(trap_item, &devlink->trap_list, list) {
if (strcmp(trap_item->group_item->group->name, group_name))
continue;
if (trap_item->action != trap_action &&
trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP)
continue;
trap_item->action = trap_action;
}
return 0;
}
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
list_for_each_entry(trap_item, &devlink->trap_list, list) {
if (strcmp(trap_item->group_item->group->name, group_name))
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
continue;
err = __devlink_trap_action_set(devlink, trap_item,
trap_action, extack);
if (err)
return err;
}
return 0;
}
static int
devlink_trap_group_action_set(struct devlink *devlink,
struct devlink_trap_group_item *group_item,
struct genl_info *info, bool *p_modified)
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
{
enum devlink_trap_action trap_action;
int err;
if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION])
return 0;
err = devlink_trap_action_get_from_info(info, &trap_action);
if (err) {
NL_SET_ERR_MSG_MOD(info->extack, "Invalid trap action");
return -EINVAL;
}
err = __devlink_trap_group_action_set(devlink, group_item, trap_action,
info->extack);
if (err)
return err;
*p_modified = true;
return 0;
}
static int devlink_trap_group_set(struct devlink *devlink,
struct devlink_trap_group_item *group_item,
struct genl_info *info)
{
struct devlink_trap_policer_item *policer_item;
struct netlink_ext_ack *extack = info->extack;
const struct devlink_trap_policer *policer;
struct nlattr **attrs = info->attrs;
u32 policer_id;
int err;
if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
return 0;
if (!devlink->ops->trap_group_set)
return -EOPNOTSUPP;
policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
policer_item = devlink_trap_policer_item_lookup(devlink, policer_id);
if (policer_id && !policer_item) {
NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
return -ENOENT;
}
policer = policer_item ? policer_item->policer : NULL;
err = devlink->ops->trap_group_set(devlink, group_item->group, policer,
extack);
if (err)
return err;
group_item->policer_item = policer_item;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
return 0;
}
static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_group_item *group_item;
bool modified = false;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
int err;
if (list_empty(&devlink->trap_group_list))
return -EOPNOTSUPP;
group_item = devlink_trap_group_item_get_from_info(devlink, info);
if (!group_item) {
NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap group");
return -ENOENT;
}
err = devlink_trap_group_action_set(devlink, group_item, info,
&modified);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
if (err)
return err;
err = devlink_trap_group_set(devlink, group_item, info);
if (err)
goto err_trap_group_set;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
return 0;
err_trap_group_set:
if (modified)
NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already");
return err;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
static struct devlink_trap_policer_item *
devlink_trap_policer_item_get_from_info(struct devlink *devlink,
struct genl_info *info)
{
u32 id;
if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
return NULL;
id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
return devlink_trap_policer_item_lookup(devlink, id);
}
static int
devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink,
const struct devlink_trap_policer *policer)
{
struct nlattr *attr;
u64 drops;
int err;
if (!devlink->ops->trap_policer_counter_get)
return 0;
err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops);
if (err)
return err;
attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
if (!attr)
return -EMSGSIZE;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops,
DEVLINK_ATTR_PAD))
goto nla_put_failure;
nla_nest_end(msg, attr);
return 0;
nla_put_failure:
nla_nest_cancel(msg, attr);
return -EMSGSIZE;
}
static int
devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink,
const struct devlink_trap_policer_item *policer_item,
enum devlink_command cmd, u32 portid, u32 seq,
int flags)
{
void *hdr;
int err;
hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
if (!hdr)
return -EMSGSIZE;
if (devlink_nl_put_handle(msg, devlink))
goto nla_put_failure;
if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
policer_item->policer->id))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE,
policer_item->rate, DEVLINK_ATTR_PAD))
goto nla_put_failure;
if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST,
policer_item->burst, DEVLINK_ATTR_PAD))
goto nla_put_failure;
err = devlink_trap_policer_stats_put(msg, devlink,
policer_item->policer);
if (err)
goto nla_put_failure;
genlmsg_end(msg, hdr);
return 0;
nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_trap_policer_item *policer_item;
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct sk_buff *msg;
int err;
if (list_empty(&devlink->trap_policer_list))
return -EOPNOTSUPP;
policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
if (!policer_item) {
NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
return -ENOENT;
}
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
err = devlink_nl_trap_policer_fill(msg, devlink, policer_item,
DEVLINK_CMD_TRAP_POLICER_NEW,
info->snd_portid, info->snd_seq, 0);
if (err)
goto err_trap_policer_fill;
return genlmsg_reply(msg, info);
err_trap_policer_fill:
nlmsg_free(msg);
return err;
}
static int
devlink_nl_cmd_trap_policer_get_dump_one(struct sk_buff *msg,
struct devlink *devlink,
struct netlink_callback *cb)
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
{
struct devlink_nl_dump_state *state = devlink_dump_state(cb);
struct devlink_trap_policer_item *policer_item;
int idx = 0;
int err = 0;
list_for_each_entry(policer_item, &devlink->trap_policer_list, list) {
if (idx < state->idx) {
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
idx++;
continue;
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
}
err = devlink_nl_trap_policer_fill(msg, devlink, policer_item,
DEVLINK_CMD_TRAP_POLICER_NEW,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI);
if (err) {
state->idx = idx;
break;
}
idx++;
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
}
return err;
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
}
const struct devlink_cmd devl_cmd_trap_policer_get = {
.dump_one = devlink_nl_cmd_trap_policer_get_dump_one,
};
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
static int
devlink_trap_policer_set(struct devlink *devlink,
struct devlink_trap_policer_item *policer_item,
struct genl_info *info)
{
struct netlink_ext_ack *extack = info->extack;
struct nlattr **attrs = info->attrs;
u64 rate, burst;
int err;
rate = policer_item->rate;
burst = policer_item->burst;
if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE])
rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]);
if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST])
burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]);
if (rate < policer_item->policer->min_rate) {
NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit");
return -EINVAL;
}
if (rate > policer_item->policer->max_rate) {
NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit");
return -EINVAL;
}
if (burst < policer_item->policer->min_burst) {
NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit");
return -EINVAL;
}
if (burst > policer_item->policer->max_burst) {
NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit");
return -EINVAL;
}
err = devlink->ops->trap_policer_set(devlink, policer_item->policer,
rate, burst, info->extack);
if (err)
return err;
policer_item->rate = rate;
policer_item->burst = burst;
return 0;
}
static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
struct devlink_trap_policer_item *policer_item;
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
if (list_empty(&devlink->trap_policer_list))
return -EOPNOTSUPP;
if (!devlink->ops->trap_policer_set)
return -EOPNOTSUPP;
policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
if (!policer_item) {
NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
return -ENOENT;
}
return devlink_trap_policer_set(devlink, policer_item, info);
}
const struct genl_small_ops devlink_nl_ops[56] = {
{
.cmd = DEVLINK_CMD_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_PORT_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_PORT_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_set_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_RATE_GET,
.doit = devlink_nl_cmd_rate_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
.internal_flags = DEVLINK_NL_FLAG_NEED_RATE,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_RATE_SET,
.doit = devlink_nl_cmd_rate_set_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_RATE,
},
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
{
.cmd = DEVLINK_CMD_RATE_NEW,
.doit = devlink_nl_cmd_rate_new_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_RATE_DEL,
.doit = devlink_nl_cmd_rate_del_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_RATE_NODE,
},
{
.cmd = DEVLINK_CMD_PORT_SPLIT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_split_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_PORT_UNSPLIT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_unsplit_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
devlink: Support add and delete devlink port Extended devlink interface for the user to add and delete a port. Extend devlink to connect user requests to driver to add/delete a port in the device. Driver routines are invoked without holding devlink instance lock. This enables driver to perform several devlink objects registration, unregistration such as (port, health reporter, resource etc) by using existing devlink APIs. This also helps to uniformly use the code for port unregistration during driver unload and during port deletion initiated by user. Examples of add, show and delete commands: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ udevadm test-builtin net_id /sys/class/net/eth6 Load module index Parsed configuration file /usr/lib/systemd/network/99-default.link Created link configuration context. Using default interface naming scheme 'v245'. ID_NET_NAMING_SCHEME=v245 ID_NET_NAME_PATH=enp6s0f0npf0sf88 ID_NET_NAME_SLOT=ens2f0npf0sf88 Unload module index Unloaded link configuration context. Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:14 -08:00
{
.cmd = DEVLINK_CMD_PORT_NEW,
.doit = devlink_nl_cmd_port_new_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_PORT_DEL,
.doit = devlink_nl_cmd_port_del_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_LINECARD_GET,
.doit = devlink_nl_cmd_linecard_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
.internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD,
/* can be retrieved by unprivileged users */
},
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
{
.cmd = DEVLINK_CMD_LINECARD_SET,
.doit = devlink_nl_cmd_linecard_set_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD,
},
{
.cmd = DEVLINK_CMD_SB_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_SB_POOL_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_pool_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_SB_POOL_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_pool_set_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_port_pool_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_SB_PORT_POOL_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_port_pool_set_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_occ_snapshot_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_sb_occ_max_clear_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_ESWITCH_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_eswitch_get_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_ESWITCH_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_eswitch_set_doit,
.flags = GENL_ADMIN_PERM,
},
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
.doit = devlink_nl_cmd_dpipe_table_get,
/* can be retrieved by unprivileged users */
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
},
{
.cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
.doit = devlink_nl_cmd_dpipe_entries_get,
/* can be retrieved by unprivileged users */
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
},
{
.cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
.doit = devlink_nl_cmd_dpipe_headers_get,
/* can be retrieved by unprivileged users */
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
},
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
.doit = devlink_nl_cmd_dpipe_table_counters_set,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_RESOURCE_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_resource_set,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_RESOURCE_DUMP,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_resource_dump,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_RELOAD,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_reload,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_PARAM_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_param_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_PARAM_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_param_set_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_PORT_PARAM_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_param_get_doit,
.dumpit = devlink_nl_cmd_port_param_get_dumpit,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_PORT_PARAM_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_port_param_set_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
},
{
.cmd = DEVLINK_CMD_REGION_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_region_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_REGION_NEW,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_region_new,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_REGION_DEL,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_region_del,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_REGION_READ,
.validate = GENL_DONT_VALIDATE_STRICT |
GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = devlink_nl_cmd_region_read_dumpit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_INFO_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_info_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_set_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_recover_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_diagnose_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
.validate = GENL_DONT_VALIDATE_STRICT |
GENL_DONT_VALIDATE_DUMP_STRICT,
.dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_dump_clear_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_health_reporter_test_doit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT,
},
{
.cmd = DEVLINK_CMD_FLASH_UPDATE,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_flash_update,
.flags = GENL_ADMIN_PERM,
},
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
{
.cmd = DEVLINK_CMD_TRAP_GET,
.doit = devlink_nl_cmd_trap_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_TRAP_SET,
.doit = devlink_nl_cmd_trap_set_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_TRAP_GROUP_GET,
.doit = devlink_nl_cmd_trap_group_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_TRAP_GROUP_SET,
.doit = devlink_nl_cmd_trap_group_set_doit,
.flags = GENL_ADMIN_PERM,
},
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
{
.cmd = DEVLINK_CMD_TRAP_POLICER_GET,
.doit = devlink_nl_cmd_trap_policer_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_TRAP_POLICER_SET,
.doit = devlink_nl_cmd_trap_policer_set_doit,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = DEVLINK_CMD_SELFTESTS_GET,
.doit = devlink_nl_cmd_selftests_get_doit,
.dumpit = devlink_nl_instance_iter_dumpit,
/* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_SELFTESTS_RUN,
.doit = devlink_nl_cmd_selftests_run,
.flags = GENL_ADMIN_PERM,
},
/* -- No new ops here! Use split ops going forward! -- */
};
static void
devlink_trap_policer_notify(struct devlink *devlink,
const struct devlink_trap_policer_item *policer_item,
enum devlink_command cmd);
static void
devlink_trap_group_notify(struct devlink *devlink,
const struct devlink_trap_group_item *group_item,
enum devlink_command cmd);
static void devlink_trap_notify(struct devlink *devlink,
const struct devlink_trap_item *trap_item,
enum devlink_command cmd);
void devlink_notify_register(struct devlink *devlink)
{
struct devlink_trap_policer_item *policer_item;
struct devlink_trap_group_item *group_item;
struct devlink_param_item *param_item;
struct devlink_trap_item *trap_item;
struct devlink_port *devlink_port;
struct devlink_linecard *linecard;
devlink: Add missed notifications iterators The commit mentioned in Fixes line missed a couple of notifications that were registered before devlink_register() and should be delayed too. As such, the too early placed WARN_ON() check spotted it. WARNING: CPU: 1 PID: 6540 at net/core/devlink.c:5158 devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Modules linked in: CPU: 1 PID: 6540 Comm: syz-executor.0 Not tainted 5.15.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Code: 38 41 b8 c0 0c 00 00 31 d2 48 89 ee 4c 89 e7 e8 72 1a 26 00 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e e9 01 bd 41 fa e8 fc bc 41 fa <0f> 0b e9 f7 fe ff ff e8 f0 bc 41 fa 0f 0b eb da 4c 89 e7 e8 c4 18 RSP: 0018:ffffc90002d6f658 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88801f08d580 RSI: ffffffff87344e94 RDI: 0000000000000003 RBP: ffff88801ee42100 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff87344d8a R11: 0000000000000000 R12: ffff88801c1dc000 R13: 0000000000000000 R14: 000000000000002c R15: ffff88801c1dc070 FS: 0000555555e8e400(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dd7c590310 CR3: 0000000069a09000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: devlink_region_create+0x39f/0x4c0 net/core/devlink.c:10327 nsim_dev_dummy_region_init drivers/net/netdevsim/dev.c:481 [inline] nsim_dev_probe+0x5f6/0x1150 drivers/net/netdevsim/dev.c:1479 call_driver_probe drivers/base/dd.c:517 [inline] really_probe+0x245/0xcc0 drivers/base/dd.c:596 __driver_probe_device+0x338/0x4d0 drivers/base/dd.c:751 driver_probe_device+0x4c/0x1a0 drivers/base/dd.c:781 __device_attach_driver+0x20b/0x2f0 drivers/base/dd.c:898 bus_for_each_drv+0x15f/0x1e0 drivers/base/bus.c:427 __device_attach+0x228/0x4a0 drivers/base/dd.c:969 bus_probe_device+0x1e4/0x290 drivers/base/bus.c:487 device_add+0xc35/0x21b0 drivers/base/core.c:3359 nsim_bus_dev_new drivers/net/netdevsim/bus.c:435 [inline] new_device_store+0x48b/0x770 drivers/net/netdevsim/bus.c:302 bus_attr_store+0x72/0xa0 drivers/base/bus.c:122 sysfs_kf_write+0x110/0x160 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x342/0x500 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2163 [inline] new_sync_write+0x429/0x660 fs/read_write.c:507 vfs_write+0x7cf/0xae0 fs/read_write.c:594 ksys_write+0x12d/0x250 fs/read_write.c:647 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f328409d3ef Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 99 fd ff ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 cc fd ff ff 48 RSP: 002b:00007ffdc6851140 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f328409d3ef RDX: 0000000000000003 RSI: 00007ffdc6851190 RDI: 0000000000000004 RBP: 0000000000000004 R08: 0000000000000000 R09: 00007ffdc68510e0 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f3284144971 R13: 00007ffdc6851190 R14: 0000000000000000 R15: 00007ffdc6851860 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Reported-by: Eric Dumazet <eric.dumazet@gmail.com> Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Link: https://lore.kernel.org/r/2ed1159291f2a589b013914f2b60d8172fc525c1.1632925030.git.leonro@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-09-29 17:18:20 +03:00
struct devlink_rate *rate_node;
struct devlink_region *region;
unsigned long port_index;
devlink_notify(devlink, DEVLINK_CMD_NEW);
list_for_each_entry(linecard, &devlink->linecard_list, list)
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
xa_for_each(&devlink->ports, port_index, devlink_port)
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
list_for_each_entry(policer_item, &devlink->trap_policer_list, list)
devlink_trap_policer_notify(devlink, policer_item,
DEVLINK_CMD_TRAP_POLICER_NEW);
list_for_each_entry(group_item, &devlink->trap_group_list, list)
devlink_trap_group_notify(devlink, group_item,
DEVLINK_CMD_TRAP_GROUP_NEW);
list_for_each_entry(trap_item, &devlink->trap_list, list)
devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW);
devlink: Add missed notifications iterators The commit mentioned in Fixes line missed a couple of notifications that were registered before devlink_register() and should be delayed too. As such, the too early placed WARN_ON() check spotted it. WARNING: CPU: 1 PID: 6540 at net/core/devlink.c:5158 devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Modules linked in: CPU: 1 PID: 6540 Comm: syz-executor.0 Not tainted 5.15.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Code: 38 41 b8 c0 0c 00 00 31 d2 48 89 ee 4c 89 e7 e8 72 1a 26 00 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e e9 01 bd 41 fa e8 fc bc 41 fa <0f> 0b e9 f7 fe ff ff e8 f0 bc 41 fa 0f 0b eb da 4c 89 e7 e8 c4 18 RSP: 0018:ffffc90002d6f658 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88801f08d580 RSI: ffffffff87344e94 RDI: 0000000000000003 RBP: ffff88801ee42100 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff87344d8a R11: 0000000000000000 R12: ffff88801c1dc000 R13: 0000000000000000 R14: 000000000000002c R15: ffff88801c1dc070 FS: 0000555555e8e400(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dd7c590310 CR3: 0000000069a09000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: devlink_region_create+0x39f/0x4c0 net/core/devlink.c:10327 nsim_dev_dummy_region_init drivers/net/netdevsim/dev.c:481 [inline] nsim_dev_probe+0x5f6/0x1150 drivers/net/netdevsim/dev.c:1479 call_driver_probe drivers/base/dd.c:517 [inline] really_probe+0x245/0xcc0 drivers/base/dd.c:596 __driver_probe_device+0x338/0x4d0 drivers/base/dd.c:751 driver_probe_device+0x4c/0x1a0 drivers/base/dd.c:781 __device_attach_driver+0x20b/0x2f0 drivers/base/dd.c:898 bus_for_each_drv+0x15f/0x1e0 drivers/base/bus.c:427 __device_attach+0x228/0x4a0 drivers/base/dd.c:969 bus_probe_device+0x1e4/0x290 drivers/base/bus.c:487 device_add+0xc35/0x21b0 drivers/base/core.c:3359 nsim_bus_dev_new drivers/net/netdevsim/bus.c:435 [inline] new_device_store+0x48b/0x770 drivers/net/netdevsim/bus.c:302 bus_attr_store+0x72/0xa0 drivers/base/bus.c:122 sysfs_kf_write+0x110/0x160 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x342/0x500 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2163 [inline] new_sync_write+0x429/0x660 fs/read_write.c:507 vfs_write+0x7cf/0xae0 fs/read_write.c:594 ksys_write+0x12d/0x250 fs/read_write.c:647 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f328409d3ef Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 99 fd ff ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 cc fd ff ff 48 RSP: 002b:00007ffdc6851140 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f328409d3ef RDX: 0000000000000003 RSI: 00007ffdc6851190 RDI: 0000000000000004 RBP: 0000000000000004 R08: 0000000000000000 R09: 00007ffdc68510e0 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f3284144971 R13: 00007ffdc6851190 R14: 0000000000000000 R15: 00007ffdc6851860 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Reported-by: Eric Dumazet <eric.dumazet@gmail.com> Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Link: https://lore.kernel.org/r/2ed1159291f2a589b013914f2b60d8172fc525c1.1632925030.git.leonro@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-09-29 17:18:20 +03:00
list_for_each_entry(rate_node, &devlink->rate_list, list)
devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
list_for_each_entry(region, &devlink->region_list, list)
devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
list_for_each_entry(param_item, &devlink->param_list, list)
devlink_param_notify(devlink, 0, param_item,
DEVLINK_CMD_PARAM_NEW);
}
void devlink_notify_unregister(struct devlink *devlink)
{
struct devlink_trap_policer_item *policer_item;
struct devlink_trap_group_item *group_item;
struct devlink_param_item *param_item;
struct devlink_trap_item *trap_item;
struct devlink_port *devlink_port;
devlink: Add missed notifications iterators The commit mentioned in Fixes line missed a couple of notifications that were registered before devlink_register() and should be delayed too. As such, the too early placed WARN_ON() check spotted it. WARNING: CPU: 1 PID: 6540 at net/core/devlink.c:5158 devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Modules linked in: CPU: 1 PID: 6540 Comm: syz-executor.0 Not tainted 5.15.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Code: 38 41 b8 c0 0c 00 00 31 d2 48 89 ee 4c 89 e7 e8 72 1a 26 00 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e e9 01 bd 41 fa e8 fc bc 41 fa <0f> 0b e9 f7 fe ff ff e8 f0 bc 41 fa 0f 0b eb da 4c 89 e7 e8 c4 18 RSP: 0018:ffffc90002d6f658 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88801f08d580 RSI: ffffffff87344e94 RDI: 0000000000000003 RBP: ffff88801ee42100 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff87344d8a R11: 0000000000000000 R12: ffff88801c1dc000 R13: 0000000000000000 R14: 000000000000002c R15: ffff88801c1dc070 FS: 0000555555e8e400(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dd7c590310 CR3: 0000000069a09000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: devlink_region_create+0x39f/0x4c0 net/core/devlink.c:10327 nsim_dev_dummy_region_init drivers/net/netdevsim/dev.c:481 [inline] nsim_dev_probe+0x5f6/0x1150 drivers/net/netdevsim/dev.c:1479 call_driver_probe drivers/base/dd.c:517 [inline] really_probe+0x245/0xcc0 drivers/base/dd.c:596 __driver_probe_device+0x338/0x4d0 drivers/base/dd.c:751 driver_probe_device+0x4c/0x1a0 drivers/base/dd.c:781 __device_attach_driver+0x20b/0x2f0 drivers/base/dd.c:898 bus_for_each_drv+0x15f/0x1e0 drivers/base/bus.c:427 __device_attach+0x228/0x4a0 drivers/base/dd.c:969 bus_probe_device+0x1e4/0x290 drivers/base/bus.c:487 device_add+0xc35/0x21b0 drivers/base/core.c:3359 nsim_bus_dev_new drivers/net/netdevsim/bus.c:435 [inline] new_device_store+0x48b/0x770 drivers/net/netdevsim/bus.c:302 bus_attr_store+0x72/0xa0 drivers/base/bus.c:122 sysfs_kf_write+0x110/0x160 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x342/0x500 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2163 [inline] new_sync_write+0x429/0x660 fs/read_write.c:507 vfs_write+0x7cf/0xae0 fs/read_write.c:594 ksys_write+0x12d/0x250 fs/read_write.c:647 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f328409d3ef Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 99 fd ff ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 cc fd ff ff 48 RSP: 002b:00007ffdc6851140 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f328409d3ef RDX: 0000000000000003 RSI: 00007ffdc6851190 RDI: 0000000000000004 RBP: 0000000000000004 R08: 0000000000000000 R09: 00007ffdc68510e0 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f3284144971 R13: 00007ffdc6851190 R14: 0000000000000000 R15: 00007ffdc6851860 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Reported-by: Eric Dumazet <eric.dumazet@gmail.com> Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Link: https://lore.kernel.org/r/2ed1159291f2a589b013914f2b60d8172fc525c1.1632925030.git.leonro@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-09-29 17:18:20 +03:00
struct devlink_rate *rate_node;
struct devlink_region *region;
unsigned long port_index;
list_for_each_entry_reverse(param_item, &devlink->param_list, list)
devlink_param_notify(devlink, 0, param_item,
DEVLINK_CMD_PARAM_DEL);
devlink: Add missed notifications iterators The commit mentioned in Fixes line missed a couple of notifications that were registered before devlink_register() and should be delayed too. As such, the too early placed WARN_ON() check spotted it. WARNING: CPU: 1 PID: 6540 at net/core/devlink.c:5158 devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Modules linked in: CPU: 1 PID: 6540 Comm: syz-executor.0 Not tainted 5.15.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:devlink_nl_region_notify+0x184/0x1e0 net/core/devlink.c:5158 Code: 38 41 b8 c0 0c 00 00 31 d2 48 89 ee 4c 89 e7 e8 72 1a 26 00 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e e9 01 bd 41 fa e8 fc bc 41 fa <0f> 0b e9 f7 fe ff ff e8 f0 bc 41 fa 0f 0b eb da 4c 89 e7 e8 c4 18 RSP: 0018:ffffc90002d6f658 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88801f08d580 RSI: ffffffff87344e94 RDI: 0000000000000003 RBP: ffff88801ee42100 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff87344d8a R11: 0000000000000000 R12: ffff88801c1dc000 R13: 0000000000000000 R14: 000000000000002c R15: ffff88801c1dc070 FS: 0000555555e8e400(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055dd7c590310 CR3: 0000000069a09000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: devlink_region_create+0x39f/0x4c0 net/core/devlink.c:10327 nsim_dev_dummy_region_init drivers/net/netdevsim/dev.c:481 [inline] nsim_dev_probe+0x5f6/0x1150 drivers/net/netdevsim/dev.c:1479 call_driver_probe drivers/base/dd.c:517 [inline] really_probe+0x245/0xcc0 drivers/base/dd.c:596 __driver_probe_device+0x338/0x4d0 drivers/base/dd.c:751 driver_probe_device+0x4c/0x1a0 drivers/base/dd.c:781 __device_attach_driver+0x20b/0x2f0 drivers/base/dd.c:898 bus_for_each_drv+0x15f/0x1e0 drivers/base/bus.c:427 __device_attach+0x228/0x4a0 drivers/base/dd.c:969 bus_probe_device+0x1e4/0x290 drivers/base/bus.c:487 device_add+0xc35/0x21b0 drivers/base/core.c:3359 nsim_bus_dev_new drivers/net/netdevsim/bus.c:435 [inline] new_device_store+0x48b/0x770 drivers/net/netdevsim/bus.c:302 bus_attr_store+0x72/0xa0 drivers/base/bus.c:122 sysfs_kf_write+0x110/0x160 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x342/0x500 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:2163 [inline] new_sync_write+0x429/0x660 fs/read_write.c:507 vfs_write+0x7cf/0xae0 fs/read_write.c:594 ksys_write+0x12d/0x250 fs/read_write.c:647 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f328409d3ef Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 99 fd ff ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 cc fd ff ff 48 RSP: 002b:00007ffdc6851140 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f328409d3ef RDX: 0000000000000003 RSI: 00007ffdc6851190 RDI: 0000000000000004 RBP: 0000000000000004 R08: 0000000000000000 R09: 00007ffdc68510e0 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f3284144971 R13: 00007ffdc6851190 R14: 0000000000000000 R15: 00007ffdc6851860 Fixes: cf530217408e ("devlink: Notify users when objects are accessible") Reported-by: Eric Dumazet <eric.dumazet@gmail.com> Tested-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Link: https://lore.kernel.org/r/2ed1159291f2a589b013914f2b60d8172fc525c1.1632925030.git.leonro@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-09-29 17:18:20 +03:00
list_for_each_entry_reverse(region, &devlink->region_list, list)
devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
list_for_each_entry_reverse(rate_node, &devlink->rate_list, list)
devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
list_for_each_entry_reverse(trap_item, &devlink->trap_list, list)
devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL);
list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list)
devlink_trap_group_notify(devlink, group_item,
DEVLINK_CMD_TRAP_GROUP_DEL);
list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list,
list)
devlink_trap_policer_notify(devlink, policer_item,
DEVLINK_CMD_TRAP_POLICER_DEL);
xa_for_each(&devlink->ports, port_index, devlink_port)
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
devlink_notify(devlink, DEVLINK_CMD_DEL);
}
static void devlink_port_type_warn(struct work_struct *work)
{
WARN(true, "Type was not set for devlink port.");
}
static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
{
/* Ignore CPU and DSA flavours. */
return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA &&
devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED;
}
devlink: Wait longer before warning about unset port type The commit cited below causes devlink to emit a warning if a type was not set on a devlink port for longer than 30 seconds to "prevent misbehavior of drivers". This proved to be problematic when unregistering the backing netdev. The flow is always: devlink_port_type_clear() // schedules the warning unregister_netdev() // blocking devlink_port_unregister() // cancels the warning The call to unregister_netdev() can block for long periods of time for various reasons: RTNL lock is contended, large amounts of configuration to unroll following dismantle of the netdev, etc. This results in devlink emitting a warning despite the driver behaving correctly. In emulated environments (of future hardware) which are usually very slow, the warning can also be emitted during port creation as more than 30 seconds can pass between the time the devlink port is registered and when its type is set. In addition, syzbot has hit this warning [1] 1974 times since 07/11/19 without being able to produce a reproducer. Probably because reproduction depends on the load or other bugs (e.g., RTNL not being released). To prevent bogus warnings, increase the timeout to 1 hour. [1] https://syzkaller.appspot.com/bug?id=e99b59e9c024a666c9f7450dc162a4b74d09d9cb Fixes: 136bf27fc0e9 ("devlink: add warning in case driver does not set port type") Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reported-by: syzbot+b0a18ed7b08b735d2f41@syzkaller.appspotmail.com Reported-by: Alex Veber <alexve@mellanox.com> Tested-by: Alex Veber <alexve@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-01-09 19:57:41 +02:00
#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600)
static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
{
if (!devlink_port_type_should_warn(devlink_port))
return;
/* Schedule a work to WARN in case driver does not set port
* type within timeout.
*/
schedule_delayed_work(&devlink_port->type_warn_dw,
DEVLINK_PORT_TYPE_WARN_TIMEOUT);
}
static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
{
if (!devlink_port_type_should_warn(devlink_port))
return;
cancel_delayed_work_sync(&devlink_port->type_warn_dw);
}
/**
* devlink_port_init() - Init devlink port
*
* @devlink: devlink
* @devlink_port: devlink port
*
* Initialize essencial stuff that is needed for functions
* that may be called before devlink port registration.
* Call to this function is optional and not needed
* in case the driver does not use such functions.
*/
void devlink_port_init(struct devlink *devlink,
struct devlink_port *devlink_port)
{
if (devlink_port->initialized)
return;
devlink_port->devlink = devlink;
INIT_LIST_HEAD(&devlink_port->region_list);
devlink_port->initialized = true;
}
EXPORT_SYMBOL_GPL(devlink_port_init);
/**
* devlink_port_fini() - Deinitialize devlink port
*
* @devlink_port: devlink port
*
* Deinitialize essencial stuff that is in use for functions
* that may be called after devlink port unregistration.
* Call to this function is optional and not needed
* in case the driver does not use such functions.
*/
void devlink_port_fini(struct devlink_port *devlink_port)
{
WARN_ON(!list_empty(&devlink_port->region_list));
}
EXPORT_SYMBOL_GPL(devlink_port_fini);
/**
* devl_port_register() - Register devlink port
*
* @devlink: devlink
* @devlink_port: devlink port
* @port_index: driver-specific numerical identifier of the port
*
* Register devlink port with provided port index. User can use
* any indexing, even hw-related one. devlink_port structure
* is convenient to be embedded inside user driver private structure.
* Note that the caller should take care of zeroing the devlink_port
* structure.
*/
int devl_port_register(struct devlink *devlink,
struct devlink_port *devlink_port,
unsigned int port_index)
{
int err;
devl_assert_locked(devlink);
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
devlink_port_init(devlink, devlink_port);
devlink_port->registered = true;
devlink_port->index = port_index;
spin_lock_init(&devlink_port->type_lock);
INIT_LIST_HEAD(&devlink_port->reporter_list);
err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL);
if (err)
return err;
INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
devlink_port_type_warn_schedule(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
return 0;
}
EXPORT_SYMBOL_GPL(devl_port_register);
/**
* devlink_port_register - Register devlink port
*
* @devlink: devlink
* @devlink_port: devlink port
* @port_index: driver-specific numerical identifier of the port
*
* Register devlink port with provided port index. User can use
* any indexing, even hw-related one. devlink_port structure
* is convenient to be embedded inside user driver private structure.
* Note that the caller should take care of zeroing the devlink_port
* structure.
*
* Context: Takes and release devlink->lock <mutex>.
*/
int devlink_port_register(struct devlink *devlink,
struct devlink_port *devlink_port,
unsigned int port_index)
{
int err;
devl_lock(devlink);
err = devl_port_register(devlink, devlink_port, port_index);
devl_unlock(devlink);
return err;
}
EXPORT_SYMBOL_GPL(devlink_port_register);
/**
* devl_port_unregister() - Unregister devlink port
*
* @devlink_port: devlink port
*/
void devl_port_unregister(struct devlink_port *devlink_port)
{
lockdep_assert_held(&devlink_port->devlink->lock);
WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET);
devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
xa_erase(&devlink_port->devlink->ports, devlink_port->index);
WARN_ON(!list_empty(&devlink_port->reporter_list));
devlink_port->registered = false;
}
EXPORT_SYMBOL_GPL(devl_port_unregister);
/**
* devlink_port_unregister - Unregister devlink port
*
* @devlink_port: devlink port
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_port_unregister(struct devlink_port *devlink_port)
{
struct devlink *devlink = devlink_port->devlink;
devl_lock(devlink);
devl_port_unregister(devlink_port);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_port_unregister);
static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
struct net_device *netdev)
{
const struct net_device_ops *ops = netdev->netdev_ops;
/* If driver registers devlink port, it should set devlink port
* attributes accordingly so the compat functions are called
* and the original ops are not used.
*/
if (ops->ndo_get_phys_port_name) {
/* Some drivers use the same set of ndos for netdevs
* that have devlink_port registered and also for
* those who don't. Make sure that ndo_get_phys_port_name
* returns -EOPNOTSUPP here in case it is defined.
* Warn if not.
*/
char name[IFNAMSIZ];
int err;
err = ops->ndo_get_phys_port_name(netdev, name, sizeof(name));
WARN_ON(err != -EOPNOTSUPP);
}
if (ops->ndo_get_port_parent_id) {
/* Some drivers use the same set of ndos for netdevs
* that have devlink_port registered and also for
* those who don't. Make sure that ndo_get_port_parent_id
* returns -EOPNOTSUPP here in case it is defined.
* Warn if not.
*/
struct netdev_phys_item_id ppid;
int err;
err = ops->ndo_get_port_parent_id(netdev, &ppid);
WARN_ON(err != -EOPNOTSUPP);
}
}
static void __devlink_port_type_set(struct devlink_port *devlink_port,
enum devlink_port_type type,
void *type_dev)
{
struct net_device *netdev = type_dev;
ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
if (type == DEVLINK_PORT_TYPE_NOTSET) {
devlink_port_type_warn_schedule(devlink_port);
} else {
devlink_port_type_warn_cancel(devlink_port);
if (type == DEVLINK_PORT_TYPE_ETH && netdev)
devlink_port_type_netdev_checks(devlink_port, netdev);
}
spin_lock_bh(&devlink_port->type_lock);
devlink_port->type = type;
switch (type) {
case DEVLINK_PORT_TYPE_ETH:
devlink_port->type_eth.netdev = netdev;
if (netdev) {
ASSERT_RTNL();
devlink_port->type_eth.ifindex = netdev->ifindex;
BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) !=
sizeof(netdev->name));
strcpy(devlink_port->type_eth.ifname, netdev->name);
}
break;
case DEVLINK_PORT_TYPE_IB:
devlink_port->type_ib.ibdev = type_dev;
break;
default:
break;
}
spin_unlock_bh(&devlink_port->type_lock);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}
/**
* devlink_port_type_eth_set - Set port type to Ethernet
*
* @devlink_port: devlink port
*
* If driver is calling this, most likely it is doing something wrong.
*/
void devlink_port_type_eth_set(struct devlink_port *devlink_port)
{
dev_warn(devlink_port->devlink->dev,
"devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
devlink_port->index);
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL);
}
EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
/**
* devlink_port_type_ib_set - Set port type to InfiniBand
*
* @devlink_port: devlink port
* @ibdev: related IB device
*/
void devlink_port_type_ib_set(struct devlink_port *devlink_port,
struct ib_device *ibdev)
{
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev);
}
EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
/**
* devlink_port_type_clear - Clear port type
*
* @devlink_port: devlink port
*
* If driver is calling this for clearing Ethernet type, most likely
* it is doing something wrong.
*/
void devlink_port_type_clear(struct devlink_port *devlink_port)
{
if (devlink_port->type == DEVLINK_PORT_TYPE_ETH)
dev_warn(devlink_port->devlink->dev,
"devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n",
devlink_port->index);
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
}
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
int devlink_port_netdevice_event(struct notifier_block *nb,
unsigned long event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
struct devlink_port *devlink_port = netdev->devlink_port;
struct devlink *devlink;
devlink = container_of(nb, struct devlink, netdevice_nb);
if (!devlink_port || devlink_port->devlink != devlink)
return NOTIFY_OK;
switch (event) {
case NETDEV_POST_INIT:
/* Set the type but not netdev pointer. It is going to be set
* later on by NETDEV_REGISTER event. Happens once during
* netdevice register
*/
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH,
NULL);
break;
case NETDEV_REGISTER:
case NETDEV_CHANGENAME:
/* Set the netdev on top of previously set type. Note this
* event happens also during net namespace change so here
* we take into account netdev pointer appearing in this
* namespace.
*/
devlink: Fix warning when unregistering a port When a devlink port is unregistered, its type is expected to be unset or otherwise a WARNING is generated [1]. This was supposed to be handled by cited commit by clearing the type upon 'NETDEV_PRE_UNINIT'. The assumption was that no other events can be generated for the netdev after this event, but this proved to be wrong. After the event is generated, netdev_wait_allrefs_any() will rebroadcast a 'NETDEV_UNREGISTER' until the netdev's reference count drops to 1. This causes devlink to set the port type back to Ethernet. Fix by only setting and clearing the port type upon 'NETDEV_POST_INIT' and 'NETDEV_PRE_UNINIT', respectively. For all other events, preserve the port type. [1] WARNING: CPU: 0 PID: 11 at net/core/devlink.c:9998 devl_port_unregister+0x2f6/0x390 net/core/devlink.c:9998 Modules linked in: CPU: 1 PID: 11 Comm: kworker/u4:1 Not tainted 6.1.0-rc3-next-20221107-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: netns cleanup_net RIP: 0010:devl_port_unregister+0x2f6/0x390 net/core/devlink.c:9998 [...] Call Trace: <TASK> __nsim_dev_port_del+0x1bb/0x240 drivers/net/netdevsim/dev.c:1433 nsim_dev_port_del_all drivers/net/netdevsim/dev.c:1443 [inline] nsim_dev_reload_destroy+0x171/0x510 drivers/net/netdevsim/dev.c:1660 nsim_dev_reload_down+0x6b/0xd0 drivers/net/netdevsim/dev.c:968 devlink_reload+0x1c2/0x6b0 net/core/devlink.c:4501 devlink_pernet_pre_exit+0x104/0x1c0 net/core/devlink.c:12609 ops_pre_exit_list net/core/net_namespace.c:159 [inline] cleanup_net+0x451/0xb10 net/core/net_namespace.c:594 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 </TASK> Fixes: 02a68a47eade ("net: devlink: track netdev with devlink_port assigned") Reported-by: syzbot+85e47e1a08b3e159b159@syzkaller.appspotmail.com Reported-by: syzbot+c2ca18f0fccdd1f09c66@syzkaller.appspotmail.com Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Link: https://lore.kernel.org/r/20221110085150.520800-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-11-10 10:51:50 +02:00
__devlink_port_type_set(devlink_port, devlink_port->type,
netdev);
break;
case NETDEV_UNREGISTER:
/* Clear netdev pointer, but not the type. This event happens
* also during net namespace change so we need to clear
* pointer to netdev that is going to another net namespace.
*/
devlink: Fix warning when unregistering a port When a devlink port is unregistered, its type is expected to be unset or otherwise a WARNING is generated [1]. This was supposed to be handled by cited commit by clearing the type upon 'NETDEV_PRE_UNINIT'. The assumption was that no other events can be generated for the netdev after this event, but this proved to be wrong. After the event is generated, netdev_wait_allrefs_any() will rebroadcast a 'NETDEV_UNREGISTER' until the netdev's reference count drops to 1. This causes devlink to set the port type back to Ethernet. Fix by only setting and clearing the port type upon 'NETDEV_POST_INIT' and 'NETDEV_PRE_UNINIT', respectively. For all other events, preserve the port type. [1] WARNING: CPU: 0 PID: 11 at net/core/devlink.c:9998 devl_port_unregister+0x2f6/0x390 net/core/devlink.c:9998 Modules linked in: CPU: 1 PID: 11 Comm: kworker/u4:1 Not tainted 6.1.0-rc3-next-20221107-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: netns cleanup_net RIP: 0010:devl_port_unregister+0x2f6/0x390 net/core/devlink.c:9998 [...] Call Trace: <TASK> __nsim_dev_port_del+0x1bb/0x240 drivers/net/netdevsim/dev.c:1433 nsim_dev_port_del_all drivers/net/netdevsim/dev.c:1443 [inline] nsim_dev_reload_destroy+0x171/0x510 drivers/net/netdevsim/dev.c:1660 nsim_dev_reload_down+0x6b/0xd0 drivers/net/netdevsim/dev.c:968 devlink_reload+0x1c2/0x6b0 net/core/devlink.c:4501 devlink_pernet_pre_exit+0x104/0x1c0 net/core/devlink.c:12609 ops_pre_exit_list net/core/net_namespace.c:159 [inline] cleanup_net+0x451/0xb10 net/core/net_namespace.c:594 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 </TASK> Fixes: 02a68a47eade ("net: devlink: track netdev with devlink_port assigned") Reported-by: syzbot+85e47e1a08b3e159b159@syzkaller.appspotmail.com Reported-by: syzbot+c2ca18f0fccdd1f09c66@syzkaller.appspotmail.com Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Link: https://lore.kernel.org/r/20221110085150.520800-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-11-10 10:51:50 +02:00
__devlink_port_type_set(devlink_port, devlink_port->type,
NULL);
break;
case NETDEV_PRE_UNINIT:
/* Clear the type and the netdev pointer. Happens one during
* netdevice unregister.
*/
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET,
NULL);
break;
}
return NOTIFY_OK;
}
static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
enum devlink_port_flavour flavour)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
devlink_port->attrs_set = true;
attrs->flavour = flavour;
if (attrs->switch_id.id_len) {
devlink_port->switch_port = true;
if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN))
attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN;
} else {
devlink_port->switch_port = false;
}
return 0;
}
/**
* devlink_port_attrs_set - Set port attributes
*
* @devlink_port: devlink port
* @attrs: devlink port attrs
*/
void devlink_port_attrs_set(struct devlink_port *devlink_port,
struct devlink_port_attrs *attrs)
{
int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
devlink_port->attrs = *attrs;
ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
if (ret)
return;
WARN_ON(attrs->splittable && attrs->split);
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
/**
* devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
*
* @devlink_port: devlink port
devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 07:50:37 +03:00
* @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
* @external: indicates if the port is for an external controller
*/
devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 07:50:37 +03:00
void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
u16 pf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_PF);
if (ret)
return;
devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 07:50:37 +03:00
attrs->pci_pf.controller = controller;
attrs->pci_pf.pf = pf;
attrs->pci_pf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
/**
* devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
*
* @devlink_port: devlink port
devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 07:50:37 +03:00
* @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
* @vf: associated VF of a PF for the devlink port instance
* @external: indicates if the port is for an external controller
*/
devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 07:50:37 +03:00
void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
u16 pf, u16 vf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_VF);
if (ret)
return;
devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-09 07:50:37 +03:00
attrs->pci_vf.controller = controller;
attrs->pci_vf.pf = pf;
attrs->pci_vf.vf = vf;
attrs->pci_vf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:13 -08:00
/**
* devlink_port_attrs_pci_sf_set - Set PCI SF port attributes
*
* @devlink_port: devlink port
* @controller: associated controller number for the devlink port instance
* @pf: associated PF for the devlink port instance
* @sf: associated SF of a PF for the devlink port instance
* @external: indicates if the port is for an external controller
devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:13 -08:00
*/
void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller,
u16 pf, u32 sf, bool external)
devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:13 -08:00
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:13 -08:00
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_SF);
if (ret)
return;
attrs->pci_sf.controller = controller;
attrs->pci_sf.pf = pf;
attrs->pci_sf.sf = sf;
attrs->pci_sf.external = external;
devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:13 -08:00
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
/**
* devl_rate_node_create - create devlink rate node
* @devlink: devlink instance
* @priv: driver private data
* @node_name: name of the resulting node
* @parent: parent devlink_rate struct
*
* Create devlink rate object of type node
*/
struct devlink_rate *
devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name,
struct devlink_rate *parent)
{
struct devlink_rate *rate_node;
rate_node = devlink_rate_node_get_by_name(devlink, node_name);
if (!IS_ERR(rate_node))
return ERR_PTR(-EEXIST);
rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
if (!rate_node)
return ERR_PTR(-ENOMEM);
if (parent) {
rate_node->parent = parent;
refcount_inc(&rate_node->parent->refcnt);
}
rate_node->type = DEVLINK_RATE_TYPE_NODE;
rate_node->devlink = devlink;
rate_node->priv = priv;
rate_node->name = kstrdup(node_name, GFP_KERNEL);
if (!rate_node->name) {
kfree(rate_node);
return ERR_PTR(-ENOMEM);
}
refcount_set(&rate_node->refcnt, 1);
list_add(&rate_node->list, &devlink->rate_list);
devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
return rate_node;
}
EXPORT_SYMBOL_GPL(devl_rate_node_create);
/**
* devl_rate_leaf_create - create devlink rate leaf
* @devlink_port: devlink port object to create rate object on
* @priv: driver private data
* @parent: parent devlink_rate struct
*
* Create devlink rate object of type leaf on provided @devlink_port.
*/
int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv,
struct devlink_rate *parent)
{
struct devlink *devlink = devlink_port->devlink;
struct devlink_rate *devlink_rate;
devl_assert_locked(devlink_port->devlink);
if (WARN_ON(devlink_port->devlink_rate))
return -EBUSY;
devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL);
if (!devlink_rate)
return -ENOMEM;
if (parent) {
devlink_rate->parent = parent;
refcount_inc(&devlink_rate->parent->refcnt);
}
devlink_rate->type = DEVLINK_RATE_TYPE_LEAF;
devlink_rate->devlink = devlink;
devlink_rate->devlink_port = devlink_port;
devlink_rate->priv = priv;
list_add_tail(&devlink_rate->list, &devlink->rate_list);
devlink_port->devlink_rate = devlink_rate;
devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW);
return 0;
}
EXPORT_SYMBOL_GPL(devl_rate_leaf_create);
/**
* devl_rate_leaf_destroy - destroy devlink rate leaf
*
* @devlink_port: devlink port linked to the rate object
*
* Destroy the devlink rate object of type leaf on provided @devlink_port.
*/
void devl_rate_leaf_destroy(struct devlink_port *devlink_port)
{
struct devlink_rate *devlink_rate = devlink_port->devlink_rate;
devl_assert_locked(devlink_port->devlink);
if (!devlink_rate)
return;
devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL);
if (devlink_rate->parent)
refcount_dec(&devlink_rate->parent->refcnt);
list_del(&devlink_rate->list);
devlink_port->devlink_rate = NULL;
kfree(devlink_rate);
}
EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy);
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
/**
* devl_rate_nodes_destroy - destroy all devlink rate nodes on device
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
* @devlink: devlink instance
*
* Unset parent for all rate objects and destroy all rate nodes
* on specified device.
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
*/
void devl_rate_nodes_destroy(struct devlink *devlink)
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
{
static struct devlink_rate *devlink_rate, *tmp;
const struct devlink_ops *ops = devlink->ops;
devl_assert_locked(devlink);
list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
if (!devlink_rate->parent)
continue;
refcount_dec(&devlink_rate->parent->refcnt);
if (devlink_rate_is_leaf(devlink_rate))
ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv,
NULL, NULL);
else if (devlink_rate_is_node(devlink_rate))
ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv,
NULL, NULL);
}
devlink: Introduce rate nodes Implement support for DEVLINK_CMD_RATE_{NEW|DEL} commands that are used to create and delete devlink rate nodes. Add new attribute DEVLINK_ATTR_RATE_NODE_NAME that specify node name string. The node name is an alphanumeric identifier. No valid node name can be a devlink port index, eg. decimal number. Extend devlink ops with new callbacks rate_node_{new|del}() and rate_node_tx_{share|max}_set() to allow supporting drivers to implement ports rate grouping and setting tx rate of rate nodes through devlink. Expose devlink_rate_nodes_destroy() function to allow vendor driver do proper cleanup of internally allocated resources for the nodes if the driver goes down or due to any other reasons which requires nodes to be destroyed. Disallow moving device from switchdev to legacy mode if any node exists on that device. User must explicitly delete nodes before switching mode. Example: $ devlink port function rate add netdevsim/netdevsim10/group1 $ devlink port function rate set netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit Add + set command can be combined: $ devlink port function rate add netdevsim/netdevsim10/group1 \ tx_share 10mbit tx_max 100mbit $ devlink port function rate show netdevsim/netdevsim10/group1 netdevsim/netdevsim10/group1: type node tx_share 10mbit tx_max 100mbit $ devlink port function rate del netdevsim/netdevsim10/group1 Co-developed-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-02 15:17:25 +03:00
list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) {
if (devlink_rate_is_node(devlink_rate)) {
ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL);
list_del(&devlink_rate->list);
kfree(devlink_rate->name);
kfree(devlink_rate);
}
}
}
EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
/**
* devlink_port_linecard_set - Link port with a linecard
*
* @devlink_port: devlink port
* @linecard: devlink linecard
*/
void devlink_port_linecard_set(struct devlink_port *devlink_port,
struct devlink_linecard *linecard)
{
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
devlink_port->linecard = linecard;
}
EXPORT_SYMBOL_GPL(devlink_port_linecard_set);
static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
char *name, size_t len)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int n = 0;
if (!devlink_port->attrs_set)
return -EOPNOTSUPP;
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
if (devlink_port->linecard)
n = snprintf(name, len, "l%u",
devlink_port->linecard->index);
if (n < len)
n += snprintf(name + n, len - n, "p%u",
attrs->phys.port_number);
if (n < len && attrs->split)
n += snprintf(name + n, len - n, "s%u",
attrs->phys.split_subport_number);
break;
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
case DEVLINK_PORT_FLAVOUR_UNUSED:
/* As CPU and DSA ports do not have a netdevice associated
* case should not ever happen.
*/
WARN_ON(1);
return -EINVAL;
case DEVLINK_PORT_FLAVOUR_PCI_PF:
if (attrs->pci_pf.external) {
n = snprintf(name, len, "c%u", attrs->pci_pf.controller);
if (n >= len)
return -EINVAL;
len -= n;
name += n;
}
n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
break;
case DEVLINK_PORT_FLAVOUR_PCI_VF:
if (attrs->pci_vf.external) {
n = snprintf(name, len, "c%u", attrs->pci_vf.controller);
if (n >= len)
return -EINVAL;
len -= n;
name += n;
}
n = snprintf(name, len, "pf%uvf%u",
attrs->pci_vf.pf, attrs->pci_vf.vf);
break;
devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:13 -08:00
case DEVLINK_PORT_FLAVOUR_PCI_SF:
if (attrs->pci_sf.external) {
n = snprintf(name, len, "c%u", attrs->pci_sf.controller);
if (n >= len)
return -EINVAL;
len -= n;
name += n;
}
devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit <parav@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Vu Pham <vuhuong@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
2020-12-11 22:12:13 -08:00
n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
attrs->pci_sf.sf);
break;
case DEVLINK_PORT_FLAVOUR_VIRTUAL:
return -EOPNOTSUPP;
}
if (n >= len)
return -EINVAL;
return 0;
}
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
static int devlink_linecard_types_init(struct devlink_linecard *linecard)
{
struct devlink_linecard_type *linecard_type;
unsigned int count;
int i;
count = linecard->ops->types_count(linecard, linecard->priv);
linecard->types = kmalloc_array(count, sizeof(*linecard_type),
GFP_KERNEL);
if (!linecard->types)
return -ENOMEM;
linecard->types_count = count;
for (i = 0; i < count; i++) {
linecard_type = &linecard->types[i];
linecard->ops->types_get(linecard, linecard->priv, i,
&linecard_type->type,
&linecard_type->priv);
}
return 0;
}
static void devlink_linecard_types_fini(struct devlink_linecard *linecard)
{
kfree(linecard->types);
}
/**
* devl_linecard_create - Create devlink linecard
*
* @devlink: devlink
* @linecard_index: driver-specific numerical identifier of the linecard
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
* @ops: linecards ops
* @priv: user priv pointer
*
* Create devlink linecard instance with provided linecard index.
* Caller can use any indexing, even hw-related one.
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
*
* Return: Line card structure or an ERR_PTR() encoded error code.
*/
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
struct devlink_linecard *
devl_linecard_create(struct devlink *devlink, unsigned int linecard_index,
const struct devlink_linecard_ops *ops, void *priv)
{
struct devlink_linecard *linecard;
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
int err;
if (WARN_ON(!ops || !ops->provision || !ops->unprovision ||
!ops->types_count || !ops->types_get))
return ERR_PTR(-EINVAL);
if (devlink_linecard_index_exists(devlink, linecard_index))
return ERR_PTR(-EEXIST);
linecard = kzalloc(sizeof(*linecard), GFP_KERNEL);
if (!linecard)
return ERR_PTR(-ENOMEM);
linecard->devlink = devlink;
linecard->index = linecard_index;
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
linecard->ops = ops;
linecard->priv = priv;
linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
mutex_init(&linecard->state_lock);
err = devlink_linecard_types_init(linecard);
if (err) {
mutex_destroy(&linecard->state_lock);
kfree(linecard);
return ERR_PTR(err);
}
list_add_tail(&linecard->list, &devlink->linecard_list);
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
return linecard;
}
EXPORT_SYMBOL_GPL(devl_linecard_create);
/**
* devl_linecard_destroy - Destroy devlink linecard
*
* @linecard: devlink linecard
*/
void devl_linecard_destroy(struct devlink_linecard *linecard)
{
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
list_del(&linecard->list);
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
devlink_linecard_types_fini(linecard);
mutex_destroy(&linecard->state_lock);
kfree(linecard);
}
EXPORT_SYMBOL_GPL(devl_linecard_destroy);
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
/**
* devlink_linecard_provision_set - Set provisioning on linecard
*
* @linecard: devlink linecard
* @type: linecard type
*
* This is either called directly from the provision() op call or
* as a result of the provision() op call asynchronously.
*/
void devlink_linecard_provision_set(struct devlink_linecard *linecard,
const char *type)
{
mutex_lock(&linecard->state_lock);
WARN_ON(linecard->type && strcmp(linecard->type, type));
linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
linecard->type = type;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_provision_set);
/**
* devlink_linecard_provision_clear - Clear provisioning on linecard
*
* @linecard: devlink linecard
*
* This is either called directly from the unprovision() op call or
* as a result of the unprovision() op call asynchronously.
*/
void devlink_linecard_provision_clear(struct devlink_linecard *linecard)
{
mutex_lock(&linecard->state_lock);
WARN_ON(linecard->nested_devlink);
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
linecard->type = NULL;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear);
/**
* devlink_linecard_provision_fail - Fail provisioning on linecard
*
* @linecard: devlink linecard
*
* This is either called directly from the provision() op call or
* as a result of the provision() op call asynchronously.
*/
void devlink_linecard_provision_fail(struct devlink_linecard *linecard)
{
mutex_lock(&linecard->state_lock);
WARN_ON(linecard->nested_devlink);
devlink: implement line card provisioning In order to be able to configure all needed stuff on a port/netdevice of a line card without the line card being present, introduce line card provisioning. Basically by setting a type, provisioning process will start and driver is supposed to create a placeholder for instances (ports/netdevices) for a line card type. Allow the user to query the supported line card types over line card get command. Then implement two netlink command SET to allow user to set/unset the card type. On the driver API side, add provision/unprovision ops and supported types array to be advertised. Upon provision op call, the driver should take care of creating the instances for the particular line card type. Introduce provision_set/clear() functions to be called by the driver once the provisioning/unprovisioning is done on its side. These helpers are not to be called directly due to the async nature of provisioning. Example: $ devlink port # No ports are listed $ devlink lc pci/0000:01:00.0: lc 1 state unprovisioned supported_types: 16x100G lc 2 state unprovisioned supported_types: 16x100G lc 3 state unprovisioned supported_types: 16x100G lc 4 state unprovisioned supported_types: 16x100G lc 5 state unprovisioned supported_types: 16x100G lc 6 state unprovisioned supported_types: 16x100G lc 7 state unprovisioned supported_types: 16x100G lc 8 state unprovisioned supported_types: 16x100G $ devlink lc set pci/0000:01:00.0 lc 8 type 16x100G $ devlink lc show pci/0000:01:00.0 lc 8 pci/0000:01:00.0: lc 8 state active type 16x100G supported_types: 16x100G $ devlink port pci/0000:01:00.0/0: type notset flavour cpu port 0 splittable false pci/0000:01:00.0/53: type eth netdev enp1s0nl8p1 flavour physical lc 8 port 1 splittable true lanes 4 pci/0000:01:00.0/54: type eth netdev enp1s0nl8p2 flavour physical lc 8 port 2 splittable true lanes 4 pci/0000:01:00.0/55: type eth netdev enp1s0nl8p3 flavour physical lc 8 port 3 splittable true lanes 4 pci/0000:01:00.0/56: type eth netdev enp1s0nl8p4 flavour physical lc 8 port 4 splittable true lanes 4 pci/0000:01:00.0/57: type eth netdev enp1s0nl8p5 flavour physical lc 8 port 5 splittable true lanes 4 pci/0000:01:00.0/58: type eth netdev enp1s0nl8p6 flavour physical lc 8 port 6 splittable true lanes 4 pci/0000:01:00.0/59: type eth netdev enp1s0nl8p7 flavour physical lc 8 port 7 splittable true lanes 4 pci/0000:01:00.0/60: type eth netdev enp1s0nl8p8 flavour physical lc 8 port 8 splittable true lanes 4 pci/0000:01:00.0/61: type eth netdev enp1s0nl8p9 flavour physical lc 8 port 9 splittable true lanes 4 pci/0000:01:00.0/62: type eth netdev enp1s0nl8p10 flavour physical lc 8 port 10 splittable true lanes 4 pci/0000:01:00.0/63: type eth netdev enp1s0nl8p11 flavour physical lc 8 port 11 splittable true lanes 4 pci/0000:01:00.0/64: type eth netdev enp1s0nl8p12 flavour physical lc 8 port 12 splittable true lanes 4 pci/0000:01:00.0/125: type eth netdev enp1s0nl8p13 flavour physical lc 8 port 13 splittable true lanes 4 pci/0000:01:00.0/126: type eth netdev enp1s0nl8p14 flavour physical lc 8 port 14 splittable true lanes 4 pci/0000:01:00.0/127: type eth netdev enp1s0nl8p15 flavour physical lc 8 port 15 splittable true lanes 4 pci/0000:01:00.0/128: type eth netdev enp1s0nl8p16 flavour physical lc 8 port 16 splittable true lanes 4 $ devlink lc set pci/0000:01:00.0 lc 8 notype Signed-off-by: Jiri Pirko <jiri@nvidia.com> Signed-off-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-18 09:42:26 +03:00
linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail);
/**
* devlink_linecard_activate - Set linecard active
*
* @linecard: devlink linecard
*/
void devlink_linecard_activate(struct devlink_linecard *linecard)
{
mutex_lock(&linecard->state_lock);
WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED);
linecard->state = DEVLINK_LINECARD_STATE_ACTIVE;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_activate);
/**
* devlink_linecard_deactivate - Set linecard inactive
*
* @linecard: devlink linecard
*/
void devlink_linecard_deactivate(struct devlink_linecard *linecard)
{
mutex_lock(&linecard->state_lock);
switch (linecard->state) {
case DEVLINK_LINECARD_STATE_ACTIVE:
linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
break;
case DEVLINK_LINECARD_STATE_UNPROVISIONING:
/* Line card is being deactivated as part
* of unprovisioning flow.
*/
break;
default:
WARN_ON(1);
break;
}
mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
/**
* devlink_linecard_nested_dl_set - Attach/detach nested devlink
* instance to linecard.
*
* @linecard: devlink linecard
* @nested_devlink: devlink instance to attach or NULL to detach
*/
void devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
struct devlink *nested_devlink)
{
mutex_lock(&linecard->state_lock);
linecard->nested_devlink = nested_devlink;
devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
mutex_unlock(&linecard->state_lock);
}
EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
int devl_sb_register(struct devlink *devlink, unsigned int sb_index,
u32 size, u16 ingress_pools_count,
u16 egress_pools_count, u16 ingress_tc_count,
u16 egress_tc_count)
{
struct devlink_sb *devlink_sb;
lockdep_assert_held(&devlink->lock);
if (devlink_sb_index_exists(devlink, sb_index))
return -EEXIST;
devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL);
if (!devlink_sb)
return -ENOMEM;
devlink_sb->index = sb_index;
devlink_sb->size = size;
devlink_sb->ingress_pools_count = ingress_pools_count;
devlink_sb->egress_pools_count = egress_pools_count;
devlink_sb->ingress_tc_count = ingress_tc_count;
devlink_sb->egress_tc_count = egress_tc_count;
list_add_tail(&devlink_sb->list, &devlink->sb_list);
return 0;
}
EXPORT_SYMBOL_GPL(devl_sb_register);
int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
u32 size, u16 ingress_pools_count,
u16 egress_pools_count, u16 ingress_tc_count,
u16 egress_tc_count)
{
int err;
devl_lock(devlink);
err = devl_sb_register(devlink, sb_index, size, ingress_pools_count,
egress_pools_count, ingress_tc_count,
egress_tc_count);
devl_unlock(devlink);
return err;
}
EXPORT_SYMBOL_GPL(devlink_sb_register);
void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index)
{
struct devlink_sb *devlink_sb;
lockdep_assert_held(&devlink->lock);
devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
WARN_ON(!devlink_sb);
list_del(&devlink_sb->list);
kfree(devlink_sb);
}
EXPORT_SYMBOL_GPL(devl_sb_unregister);
void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
{
devl_lock(devlink);
devl_sb_unregister(devlink, sb_index);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_sb_unregister);
/**
* devl_dpipe_headers_register - register dpipe headers
*
* @devlink: devlink
* @dpipe_headers: dpipe header array
*
* Register the headers supported by hardware.
*/
void devl_dpipe_headers_register(struct devlink *devlink,
struct devlink_dpipe_headers *dpipe_headers)
{
lockdep_assert_held(&devlink->lock);
devlink->dpipe_headers = dpipe_headers;
}
EXPORT_SYMBOL_GPL(devl_dpipe_headers_register);
/**
* devl_dpipe_headers_unregister - unregister dpipe headers
*
* @devlink: devlink
*
* Unregister the headers supported by hardware.
*/
void devl_dpipe_headers_unregister(struct devlink *devlink)
{
lockdep_assert_held(&devlink->lock);
devlink->dpipe_headers = NULL;
}
EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
/**
* devlink_dpipe_table_counter_enabled - check if counter allocation
* required
* @devlink: devlink
* @table_name: tables name
*
* Used by driver to check if counter allocation is required.
* After counter allocation is turned on the table entries
* are updated to include counter statistics.
*
* After that point on the driver must respect the counter
* state so that each entry added to the table is added
* with a counter.
*/
bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
const char *table_name)
{
struct devlink_dpipe_table *table;
bool enabled;
rcu_read_lock();
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
enabled = false;
if (table)
enabled = table->counters_enabled;
rcu_read_unlock();
return enabled;
}
EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled);
/**
* devl_dpipe_table_register - register dpipe table
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
*
* @devlink: devlink
* @table_name: table name
* @table_ops: table ops
* @priv: priv
* @counter_control_extern: external control for counters
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
*/
int devl_dpipe_table_register(struct devlink *devlink,
const char *table_name,
struct devlink_dpipe_table_ops *table_ops,
void *priv, bool counter_control_extern)
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
{
struct devlink_dpipe_table *table;
lockdep_assert_held(&devlink->lock);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (WARN_ON(!table_ops->size_get))
return -EINVAL;
if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name,
devlink))
return -EEXIST;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
table = kzalloc(sizeof(*table), GFP_KERNEL);
if (!table)
return -ENOMEM;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
table->name = table_name;
table->table_ops = table_ops;
table->priv = priv;
table->counter_control_extern = counter_control_extern;
list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
return 0;
}
EXPORT_SYMBOL_GPL(devl_dpipe_table_register);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
/**
* devl_dpipe_table_unregister - unregister dpipe table
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
*
* @devlink: devlink
* @table_name: table name
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
*/
void devl_dpipe_table_unregister(struct devlink *devlink,
const char *table_name)
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
{
struct devlink_dpipe_table *table;
lockdep_assert_held(&devlink->lock);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
if (!table)
return;
devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-28 17:24:10 +02:00
list_del_rcu(&table->list);
kfree_rcu(table, rcu);
}
EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister);
/**
* devl_resource_register - devlink resource register
*
* @devlink: devlink
* @resource_name: resource's name
* @resource_size: resource's size
* @resource_id: resource's id
* @parent_resource_id: resource's parent id
* @size_params: size parameters
*
* Generic resources should reuse the same names across drivers.
* Please see the generic resources list at:
* Documentation/networking/devlink/devlink-resource.rst
*/
int devl_resource_register(struct devlink *devlink,
const char *resource_name,
u64 resource_size,
u64 resource_id,
u64 parent_resource_id,
const struct devlink_resource_size_params *size_params)
{
struct devlink_resource *resource;
struct list_head *resource_list;
bool top_hierarchy;
lockdep_assert_held(&devlink->lock);
top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
resource = devlink_resource_find(devlink, NULL, resource_id);
if (resource)
return -EINVAL;
resource = kzalloc(sizeof(*resource), GFP_KERNEL);
if (!resource)
return -ENOMEM;
if (top_hierarchy) {
resource_list = &devlink->resource_list;
} else {
struct devlink_resource *parent_resource;
parent_resource = devlink_resource_find(devlink, NULL,
parent_resource_id);
if (parent_resource) {
resource_list = &parent_resource->resource_list;
resource->parent = parent_resource;
} else {
kfree(resource);
return -EINVAL;
}
}
resource->name = resource_name;
resource->size = resource_size;
resource->size_new = resource_size;
resource->id = resource_id;
resource->size_valid = true;
memcpy(&resource->size_params, size_params,
sizeof(resource->size_params));
INIT_LIST_HEAD(&resource->resource_list);
list_add_tail(&resource->list, resource_list);
return 0;
}
EXPORT_SYMBOL_GPL(devl_resource_register);
/**
* devlink_resource_register - devlink resource register
*
* @devlink: devlink
* @resource_name: resource's name
* @resource_size: resource's size
* @resource_id: resource's id
* @parent_resource_id: resource's parent id
* @size_params: size parameters
*
* Generic resources should reuse the same names across drivers.
* Please see the generic resources list at:
* Documentation/networking/devlink/devlink-resource.rst
*
* Context: Takes and release devlink->lock <mutex>.
*/
int devlink_resource_register(struct devlink *devlink,
const char *resource_name,
u64 resource_size,
u64 resource_id,
u64 parent_resource_id,
const struct devlink_resource_size_params *size_params)
{
int err;
devl_lock(devlink);
err = devl_resource_register(devlink, resource_name, resource_size,
resource_id, parent_resource_id, size_params);
devl_unlock(devlink);
return err;
}
EXPORT_SYMBOL_GPL(devlink_resource_register);
static void devlink_resource_unregister(struct devlink *devlink,
struct devlink_resource *resource)
{
struct devlink_resource *tmp, *child_resource;
list_for_each_entry_safe(child_resource, tmp, &resource->resource_list,
list) {
devlink_resource_unregister(devlink, child_resource);
list_del(&child_resource->list);
kfree(child_resource);
}
}
/**
* devl_resources_unregister - free all resources
*
* @devlink: devlink
*/
void devl_resources_unregister(struct devlink *devlink)
{
struct devlink_resource *tmp, *child_resource;
lockdep_assert_held(&devlink->lock);
list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list,
list) {
devlink_resource_unregister(devlink, child_resource);
list_del(&child_resource->list);
kfree(child_resource);
}
}
EXPORT_SYMBOL_GPL(devl_resources_unregister);
/**
* devlink_resources_unregister - free all resources
*
* @devlink: devlink
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_resources_unregister(struct devlink *devlink)
{
devl_lock(devlink);
devl_resources_unregister(devlink);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_resources_unregister);
/**
* devl_resource_size_get - get and update size
*
* @devlink: devlink
* @resource_id: the requested resource id
* @p_resource_size: ptr to update
*/
int devl_resource_size_get(struct devlink *devlink,
u64 resource_id,
u64 *p_resource_size)
{
struct devlink_resource *resource;
lockdep_assert_held(&devlink->lock);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (!resource)
return -EINVAL;
*p_resource_size = resource->size_new;
resource->size = resource->size_new;
return 0;
}
EXPORT_SYMBOL_GPL(devl_resource_size_get);
/**
* devl_dpipe_table_resource_set - set the resource id
*
* @devlink: devlink
* @table_name: table name
* @resource_id: resource id
* @resource_units: number of resource's units consumed per table's entry
*/
int devl_dpipe_table_resource_set(struct devlink *devlink,
const char *table_name, u64 resource_id,
u64 resource_units)
{
struct devlink_dpipe_table *table;
table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
table_name, devlink);
if (!table)
return -EINVAL;
table->resource_id = resource_id;
table->resource_units = resource_units;
table->resource_valid = true;
return 0;
}
EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set);
/**
* devl_resource_occ_get_register - register occupancy getter
*
* @devlink: devlink
* @resource_id: resource id
* @occ_get: occupancy getter callback
* @occ_get_priv: occupancy getter callback priv
*/
void devl_resource_occ_get_register(struct devlink *devlink,
u64 resource_id,
devlink_resource_occ_get_t *occ_get,
void *occ_get_priv)
{
struct devlink_resource *resource;
lockdep_assert_held(&devlink->lock);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (WARN_ON(!resource))
return;
WARN_ON(resource->occ_get);
resource->occ_get = occ_get;
resource->occ_get_priv = occ_get_priv;
}
EXPORT_SYMBOL_GPL(devl_resource_occ_get_register);
/**
* devlink_resource_occ_get_register - register occupancy getter
*
* @devlink: devlink
* @resource_id: resource id
* @occ_get: occupancy getter callback
* @occ_get_priv: occupancy getter callback priv
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_resource_occ_get_register(struct devlink *devlink,
u64 resource_id,
devlink_resource_occ_get_t *occ_get,
void *occ_get_priv)
{
devl_lock(devlink);
devl_resource_occ_get_register(devlink, resource_id,
occ_get, occ_get_priv);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register);
/**
* devl_resource_occ_get_unregister - unregister occupancy getter
*
* @devlink: devlink
* @resource_id: resource id
*/
void devl_resource_occ_get_unregister(struct devlink *devlink,
u64 resource_id)
{
struct devlink_resource *resource;
lockdep_assert_held(&devlink->lock);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (WARN_ON(!resource))
return;
WARN_ON(!resource->occ_get);
resource->occ_get = NULL;
resource->occ_get_priv = NULL;
}
EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister);
/**
* devlink_resource_occ_get_unregister - unregister occupancy getter
*
* @devlink: devlink
* @resource_id: resource id
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_resource_occ_get_unregister(struct devlink *devlink,
u64 resource_id)
{
devl_lock(devlink);
devl_resource_occ_get_unregister(devlink, resource_id);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
static int devlink_param_verify(const struct devlink_param *param)
{
if (!param || !param->name || !param->supported_cmodes)
return -EINVAL;
if (param->generic)
return devlink_param_generic_verify(param);
else
return devlink_param_driver_verify(param);
}
static int devlink_param_register(struct devlink *devlink,
const struct devlink_param *param)
{
struct devlink_param_item *param_item;
WARN_ON(devlink_param_verify(param));
WARN_ON(devlink_param_find_by_name(&devlink->param_list, param->name));
if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
WARN_ON(param->get || param->set);
else
WARN_ON(!param->get || !param->set);
param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
if (!param_item)
return -ENOMEM;
param_item->param = param;
list_add_tail(&param_item->list, &devlink->param_list);
devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
return 0;
}
static void devlink_param_unregister(struct devlink *devlink,
const struct devlink_param *param)
{
struct devlink_param_item *param_item;
param_item =
devlink_param_find_by_name(&devlink->param_list, param->name);
if (WARN_ON(!param_item))
return;
devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL);
list_del(&param_item->list);
kfree(param_item);
}
/**
* devl_params_register - register configuration parameters
*
* @devlink: devlink
* @params: configuration parameters array
* @params_count: number of parameters provided
*
* Register the configuration parameters supported by the driver.
*/
int devl_params_register(struct devlink *devlink,
const struct devlink_param *params,
size_t params_count)
{
const struct devlink_param *param = params;
int i, err;
lockdep_assert_held(&devlink->lock);
for (i = 0; i < params_count; i++, param++) {
err = devlink_param_register(devlink, param);
if (err)
goto rollback;
}
return 0;
rollback:
if (!i)
return err;
for (param--; i > 0; i--, param--)
devlink_param_unregister(devlink, param);
return err;
}
EXPORT_SYMBOL_GPL(devl_params_register);
int devlink_params_register(struct devlink *devlink,
const struct devlink_param *params,
size_t params_count)
{
int err;
devl_lock(devlink);
err = devl_params_register(devlink, params, params_count);
devl_unlock(devlink);
return err;
}
EXPORT_SYMBOL_GPL(devlink_params_register);
/**
* devl_params_unregister - unregister configuration parameters
* @devlink: devlink
* @params: configuration parameters to unregister
* @params_count: number of parameters provided
*/
void devl_params_unregister(struct devlink *devlink,
const struct devlink_param *params,
size_t params_count)
{
const struct devlink_param *param = params;
int i;
lockdep_assert_held(&devlink->lock);
for (i = 0; i < params_count; i++, param++)
devlink_param_unregister(devlink, param);
}
EXPORT_SYMBOL_GPL(devl_params_unregister);
void devlink_params_unregister(struct devlink *devlink,
const struct devlink_param *params,
size_t params_count)
{
devl_lock(devlink);
devl_params_unregister(devlink, params, params_count);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_params_unregister);
/**
* devl_param_driverinit_value_get - get configuration parameter
* value for driver initializing
*
* @devlink: devlink
* @param_id: parameter ID
* @init_val: value of parameter in driverinit configuration mode
*
* This function should be used by the driver to get driverinit
* configuration for initialization after reload command.
*/
int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
union devlink_param_value *init_val)
{
struct devlink_param_item *param_item;
lockdep_assert_held(&devlink->lock);
if (WARN_ON(!devlink_reload_supported(devlink->ops)))
return -EOPNOTSUPP;
param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
if (!param_item)
return -EINVAL;
if (!param_item->driverinit_value_valid)
return -EOPNOTSUPP;
if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param,
DEVLINK_PARAM_CMODE_DRIVERINIT)))
return -EOPNOTSUPP;
if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
strcpy(init_val->vstr, param_item->driverinit_value.vstr);
else
*init_val = param_item->driverinit_value;
return 0;
}
EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get);
/**
* devl_param_driverinit_value_set - set value of configuration
* parameter for driverinit
* configuration mode
*
* @devlink: devlink
* @param_id: parameter ID
* @init_val: value of parameter to set for driverinit configuration mode
*
* This function should be used by the driver to set driverinit
* configuration mode default value.
*/
void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
union devlink_param_value init_val)
{
struct devlink_param_item *param_item;
param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
if (WARN_ON(!param_item))
return;
if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param,
DEVLINK_PARAM_CMODE_DRIVERINIT)))
return;
if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
strcpy(param_item->driverinit_value.vstr, init_val.vstr);
else
param_item->driverinit_value = init_val;
param_item->driverinit_value_valid = true;
devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
}
EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set);
/**
* devl_param_value_changed - notify devlink on a parameter's value
* change. Should be called by the driver
* right after the change.
*
* @devlink: devlink
* @param_id: parameter ID
*
* This function should be used by the driver to notify devlink on value
* change, excluding driverinit configuration mode.
* For driverinit configuration mode driver should use the function
*/
void devl_param_value_changed(struct devlink *devlink, u32 param_id)
{
struct devlink_param_item *param_item;
param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
WARN_ON(!param_item);
devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
}
EXPORT_SYMBOL_GPL(devl_param_value_changed);
/**
* devl_region_create - create a new address region
*
* @devlink: devlink
* @ops: region operations and name
* @region_max_snapshots: Maximum supported number of snapshots for region
* @region_size: size of region
*/
struct devlink_region *devl_region_create(struct devlink *devlink,
const struct devlink_region_ops *ops,
u32 region_max_snapshots,
u64 region_size)
{
struct devlink_region *region;
devl_assert_locked(devlink);
if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
return ERR_PTR(-EINVAL);
if (devlink_region_get_by_name(devlink, ops->name))
return ERR_PTR(-EEXIST);
region = kzalloc(sizeof(*region), GFP_KERNEL);
if (!region)
return ERR_PTR(-ENOMEM);
region->devlink = devlink;
region->max_snapshots = region_max_snapshots;
region->ops = ops;
region->size = region_size;
INIT_LIST_HEAD(&region->snapshot_list);
mutex_init(&region->snapshot_lock);
list_add_tail(&region->list, &devlink->region_list);
devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
return region;
}
EXPORT_SYMBOL_GPL(devl_region_create);
/**
* devlink_region_create - create a new address region
*
* @devlink: devlink
* @ops: region operations and name
* @region_max_snapshots: Maximum supported number of snapshots for region
* @region_size: size of region
*
* Context: Takes and release devlink->lock <mutex>.
*/
struct devlink_region *
devlink_region_create(struct devlink *devlink,
const struct devlink_region_ops *ops,
u32 region_max_snapshots, u64 region_size)
{
struct devlink_region *region;
devl_lock(devlink);
region = devl_region_create(devlink, ops, region_max_snapshots,
region_size);
devl_unlock(devlink);
return region;
}
EXPORT_SYMBOL_GPL(devlink_region_create);
/**
* devlink_port_region_create - create a new address region for a port
*
* @port: devlink port
* @ops: region operations and name
* @region_max_snapshots: Maximum supported number of snapshots for region
* @region_size: size of region
*
* Context: Takes and release devlink->lock <mutex>.
*/
struct devlink_region *
devlink_port_region_create(struct devlink_port *port,
const struct devlink_port_region_ops *ops,
u32 region_max_snapshots, u64 region_size)
{
struct devlink *devlink = port->devlink;
struct devlink_region *region;
int err = 0;
ASSERT_DEVLINK_PORT_INITIALIZED(port);
if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
return ERR_PTR(-EINVAL);
devl_lock(devlink);
if (devlink_port_region_get_by_name(port, ops->name)) {
err = -EEXIST;
goto unlock;
}
region = kzalloc(sizeof(*region), GFP_KERNEL);
if (!region) {
err = -ENOMEM;
goto unlock;
}
region->devlink = devlink;
region->port = port;
region->max_snapshots = region_max_snapshots;
region->port_ops = ops;
region->size = region_size;
INIT_LIST_HEAD(&region->snapshot_list);
mutex_init(&region->snapshot_lock);
list_add_tail(&region->list, &port->region_list);
devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
devl_unlock(devlink);
return region;
unlock:
devl_unlock(devlink);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(devlink_port_region_create);
/**
* devl_region_destroy - destroy address region
*
* @region: devlink region to destroy
*/
void devl_region_destroy(struct devlink_region *region)
{
struct devlink *devlink = region->devlink;
struct devlink_snapshot *snapshot, *ts;
devl_assert_locked(devlink);
/* Free all snapshots of region */
mutex_lock(&region->snapshot_lock);
list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
devlink_region_snapshot_del(region, snapshot);
mutex_unlock(&region->snapshot_lock);
list_del(&region->list);
mutex_destroy(&region->snapshot_lock);
devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
kfree(region);
}
EXPORT_SYMBOL_GPL(devl_region_destroy);
/**
* devlink_region_destroy - destroy address region
*
* @region: devlink region to destroy
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_region_destroy(struct devlink_region *region)
{
struct devlink *devlink = region->devlink;
devl_lock(devlink);
devl_region_destroy(region);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_region_destroy);
/**
* devlink_region_snapshot_id_get - get snapshot ID
*
* This callback should be called when adding a new snapshot,
* Driver should use the same id for multiple snapshots taken
* on multiple regions at the same time/by the same trigger.
*
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
* The caller of this function must use devlink_region_snapshot_id_put
* when finished creating regions using this id.
*
* Returns zero on success, or a negative error code on failure.
*
* @devlink: devlink
* @id: storage to return id
*/
int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
{
return __devlink_region_snapshot_id_get(devlink, id);
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
devlink: track snapshot id usage count using an xarray Each snapshot created for a devlink region must have an id. These ids are supposed to be unique per "event" that caused the snapshot to be created. Drivers call devlink_region_snapshot_id_get to obtain a new id to use for a new event trigger. The id values are tracked per devlink, so that the same id number can be used if a triggering event creates multiple snapshots on different regions. There is no mechanism for snapshot ids to ever be reused. Introduce an xarray to store the count of how many snapshots are using a given id, replacing the snapshot_id field previously used for picking the next id. The devlink_region_snapshot_id_get() function will use xa_alloc to insert an initial value of 1 value at an available slot between 0 and U32_MAX. The new __devlink_snapshot_id_increment() and __devlink_snapshot_id_decrement() functions will be used to track how many snapshots currently use an id. Drivers must now call devlink_snapshot_id_put() in order to release their reference of the snapshot id after adding region snapshots. By tracking the total number of snapshots using a given id, it is possible for the decrement() function to erase the id from the xarray when it is not in use. With this method, a snapshot id can become reused again once all snapshots that referred to it have been deleted via DEVLINK_CMD_REGION_DEL, and the driver has finished adding snapshots. This work also paves the way to introduce a mechanism for userspace to request a snapshot. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-26 11:37:15 -07:00
/**
* devlink_region_snapshot_id_put - put snapshot ID reference
*
* This should be called by a driver after finishing creating snapshots
* with an id. Doing so ensures that the ID can later be released in the
* event that all snapshots using it have been destroyed.
*
* @devlink: devlink
* @id: id to release reference on
*/
void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id)
{
__devlink_snapshot_id_decrement(devlink, id);
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put);
/**
* devlink_region_snapshot_create - create a new snapshot
* This will add a new snapshot of a region. The snapshot
* will be stored on the region struct and can be accessed
* from devlink. This is useful for future analyses of snapshots.
* Multiple snapshots can be created on a region.
* The @snapshot_id should be obtained using the getter function.
*
* @region: devlink region of the snapshot
* @data: snapshot data
* @snapshot_id: snapshot id to be created
*/
int devlink_region_snapshot_create(struct devlink_region *region,
u8 *data, u32 snapshot_id)
{
int err;
mutex_lock(&region->snapshot_lock);
err = __devlink_region_snapshot_create(region, data, snapshot_id);
mutex_unlock(&region->snapshot_lock);
return err;
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
#define DEVLINK_TRAP(_id, _type) \
{ \
.type = DEVLINK_TRAP_TYPE_##_type, \
.id = DEVLINK_TRAP_GENERIC_ID_##_id, \
.name = DEVLINK_TRAP_GENERIC_NAME_##_id, \
}
static const struct devlink_trap devlink_trap_generic[] = {
DEVLINK_TRAP(SMAC_MC, DROP),
DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP),
DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP),
DEVLINK_TRAP(INGRESS_STP_FILTER, DROP),
DEVLINK_TRAP(EMPTY_TX_LIST, DROP),
DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP),
DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP),
DEVLINK_TRAP(TTL_ERROR, EXCEPTION),
DEVLINK_TRAP(TAIL_DROP, DROP),
DEVLINK_TRAP(NON_IP_PACKET, DROP),
DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP),
DEVLINK_TRAP(DIP_LB, DROP),
DEVLINK_TRAP(SIP_MC, DROP),
DEVLINK_TRAP(SIP_LB, DROP),
DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP),
DEVLINK_TRAP(IPV4_SIP_BC, DROP),
DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP),
DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP),
DEVLINK_TRAP(MTU_ERROR, EXCEPTION),
DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION),
DEVLINK_TRAP(RPF, EXCEPTION),
DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION),
DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION),
DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION),
DEVLINK_TRAP(NON_ROUTABLE, DROP),
DEVLINK_TRAP(DECAP_ERROR, EXCEPTION),
DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP),
DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP),
DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP),
DEVLINK_TRAP(STP, CONTROL),
DEVLINK_TRAP(LACP, CONTROL),
DEVLINK_TRAP(LLDP, CONTROL),
DEVLINK_TRAP(IGMP_QUERY, CONTROL),
DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL),
DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL),
DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL),
DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL),
DEVLINK_TRAP(MLD_QUERY, CONTROL),
DEVLINK_TRAP(MLD_V1_REPORT, CONTROL),
DEVLINK_TRAP(MLD_V2_REPORT, CONTROL),
DEVLINK_TRAP(MLD_V1_DONE, CONTROL),
DEVLINK_TRAP(IPV4_DHCP, CONTROL),
DEVLINK_TRAP(IPV6_DHCP, CONTROL),
DEVLINK_TRAP(ARP_REQUEST, CONTROL),
DEVLINK_TRAP(ARP_RESPONSE, CONTROL),
DEVLINK_TRAP(ARP_OVERLAY, CONTROL),
DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL),
DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL),
DEVLINK_TRAP(IPV4_BFD, CONTROL),
DEVLINK_TRAP(IPV6_BFD, CONTROL),
DEVLINK_TRAP(IPV4_OSPF, CONTROL),
DEVLINK_TRAP(IPV6_OSPF, CONTROL),
DEVLINK_TRAP(IPV4_BGP, CONTROL),
DEVLINK_TRAP(IPV6_BGP, CONTROL),
DEVLINK_TRAP(IPV4_VRRP, CONTROL),
DEVLINK_TRAP(IPV6_VRRP, CONTROL),
DEVLINK_TRAP(IPV4_PIM, CONTROL),
DEVLINK_TRAP(IPV6_PIM, CONTROL),
DEVLINK_TRAP(UC_LB, CONTROL),
DEVLINK_TRAP(LOCAL_ROUTE, CONTROL),
DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL),
DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL),
DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL),
DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL),
DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL),
DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL),
DEVLINK_TRAP(IPV6_REDIRECT, CONTROL),
DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL),
DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL),
DEVLINK_TRAP(PTP_EVENT, CONTROL),
DEVLINK_TRAP(PTP_GENERAL, CONTROL),
DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL),
DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL),
DEVLINK_TRAP(EARLY_DROP, DROP),
DEVLINK_TRAP(VXLAN_PARSING, DROP),
DEVLINK_TRAP(LLC_SNAP_PARSING, DROP),
DEVLINK_TRAP(VLAN_PARSING, DROP),
DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP),
DEVLINK_TRAP(MPLS_PARSING, DROP),
DEVLINK_TRAP(ARP_PARSING, DROP),
DEVLINK_TRAP(IP_1_PARSING, DROP),
DEVLINK_TRAP(IP_N_PARSING, DROP),
DEVLINK_TRAP(GRE_PARSING, DROP),
DEVLINK_TRAP(UDP_PARSING, DROP),
DEVLINK_TRAP(TCP_PARSING, DROP),
DEVLINK_TRAP(IPSEC_PARSING, DROP),
DEVLINK_TRAP(SCTP_PARSING, DROP),
DEVLINK_TRAP(DCCP_PARSING, DROP),
DEVLINK_TRAP(GTP_PARSING, DROP),
DEVLINK_TRAP(ESP_PARSING, DROP),
DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP),
DEVLINK_TRAP(DMAC_FILTER, DROP),
DEVLINK_TRAP(EAPOL, CONTROL),
DEVLINK_TRAP(LOCKED_PORT, DROP),
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
};
#define DEVLINK_TRAP_GROUP(_id) \
{ \
.id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \
.name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \
}
static const struct devlink_trap_group devlink_trap_group_generic[] = {
DEVLINK_TRAP_GROUP(L2_DROPS),
DEVLINK_TRAP_GROUP(L3_DROPS),
DEVLINK_TRAP_GROUP(L3_EXCEPTIONS),
DEVLINK_TRAP_GROUP(BUFFER_DROPS),
DEVLINK_TRAP_GROUP(TUNNEL_DROPS),
DEVLINK_TRAP_GROUP(ACL_DROPS),
DEVLINK_TRAP_GROUP(STP),
DEVLINK_TRAP_GROUP(LACP),
DEVLINK_TRAP_GROUP(LLDP),
DEVLINK_TRAP_GROUP(MC_SNOOPING),
DEVLINK_TRAP_GROUP(DHCP),
DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY),
DEVLINK_TRAP_GROUP(BFD),
DEVLINK_TRAP_GROUP(OSPF),
DEVLINK_TRAP_GROUP(BGP),
DEVLINK_TRAP_GROUP(VRRP),
DEVLINK_TRAP_GROUP(PIM),
DEVLINK_TRAP_GROUP(UC_LB),
DEVLINK_TRAP_GROUP(LOCAL_DELIVERY),
DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY),
DEVLINK_TRAP_GROUP(IPV6),
DEVLINK_TRAP_GROUP(PTP_EVENT),
DEVLINK_TRAP_GROUP(PTP_GENERAL),
DEVLINK_TRAP_GROUP(ACL_SAMPLE),
DEVLINK_TRAP_GROUP(ACL_TRAP),
DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS),
DEVLINK_TRAP_GROUP(EAPOL),
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
};
static int devlink_trap_generic_verify(const struct devlink_trap *trap)
{
if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX)
return -EINVAL;
if (strcmp(trap->name, devlink_trap_generic[trap->id].name))
return -EINVAL;
if (trap->type != devlink_trap_generic[trap->id].type)
return -EINVAL;
return 0;
}
static int devlink_trap_driver_verify(const struct devlink_trap *trap)
{
int i;
if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(devlink_trap_generic); i++) {
if (!strcmp(trap->name, devlink_trap_generic[i].name))
return -EEXIST;
}
return 0;
}
static int devlink_trap_verify(const struct devlink_trap *trap)
{
if (!trap || !trap->name)
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
return -EINVAL;
if (trap->generic)
return devlink_trap_generic_verify(trap);
else
return devlink_trap_driver_verify(trap);
}
static int
devlink_trap_group_generic_verify(const struct devlink_trap_group *group)
{
if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX)
return -EINVAL;
if (strcmp(group->name, devlink_trap_group_generic[group->id].name))
return -EINVAL;
return 0;
}
static int
devlink_trap_group_driver_verify(const struct devlink_trap_group *group)
{
int i;
if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(devlink_trap_group_generic); i++) {
if (!strcmp(group->name, devlink_trap_group_generic[i].name))
return -EEXIST;
}
return 0;
}
static int devlink_trap_group_verify(const struct devlink_trap_group *group)
{
if (group->generic)
return devlink_trap_group_generic_verify(group);
else
return devlink_trap_group_driver_verify(group);
}
static void
devlink_trap_group_notify(struct devlink *devlink,
const struct devlink_trap_group_item *group_item,
enum devlink_command cmd)
{
struct sk_buff *msg;
int err;
WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW &&
cmd != DEVLINK_CMD_TRAP_GROUP_DEL);
if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
return;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_trap_group_fill(msg, devlink, group_item, cmd, 0, 0,
0);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static int
devlink_trap_item_group_link(struct devlink *devlink,
struct devlink_trap_item *trap_item)
{
u16 group_id = trap_item->trap->init_group_id;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
struct devlink_trap_group_item *group_item;
group_item = devlink_trap_group_item_lookup_by_id(devlink, group_id);
if (WARN_ON_ONCE(!group_item))
return -EINVAL;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
trap_item->group_item = group_item;
return 0;
}
static void devlink_trap_notify(struct devlink *devlink,
const struct devlink_trap_item *trap_item,
enum devlink_command cmd)
{
struct sk_buff *msg;
int err;
WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW &&
cmd != DEVLINK_CMD_TRAP_DEL);
if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
return;
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_trap_fill(msg, devlink, trap_item, cmd, 0, 0, 0);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static int
devlink_trap_register(struct devlink *devlink,
const struct devlink_trap *trap, void *priv)
{
struct devlink_trap_item *trap_item;
int err;
if (devlink_trap_item_lookup(devlink, trap->name))
return -EEXIST;
trap_item = kzalloc(sizeof(*trap_item), GFP_KERNEL);
if (!trap_item)
return -ENOMEM;
trap_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
if (!trap_item->stats) {
err = -ENOMEM;
goto err_stats_alloc;
}
trap_item->trap = trap;
trap_item->action = trap->init_action;
trap_item->priv = priv;
err = devlink_trap_item_group_link(devlink, trap_item);
if (err)
goto err_group_link;
err = devlink->ops->trap_init(devlink, trap, trap_item);
if (err)
goto err_trap_init;
list_add_tail(&trap_item->list, &devlink->trap_list);
devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW);
return 0;
err_trap_init:
err_group_link:
free_percpu(trap_item->stats);
err_stats_alloc:
kfree(trap_item);
return err;
}
static void devlink_trap_unregister(struct devlink *devlink,
const struct devlink_trap *trap)
{
struct devlink_trap_item *trap_item;
trap_item = devlink_trap_item_lookup(devlink, trap->name);
if (WARN_ON_ONCE(!trap_item))
return;
devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL);
list_del(&trap_item->list);
if (devlink->ops->trap_fini)
devlink->ops->trap_fini(devlink, trap, trap_item);
free_percpu(trap_item->stats);
kfree(trap_item);
}
static void devlink_trap_disable(struct devlink *devlink,
const struct devlink_trap *trap)
{
struct devlink_trap_item *trap_item;
trap_item = devlink_trap_item_lookup(devlink, trap->name);
if (WARN_ON_ONCE(!trap_item))
return;
devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP,
NULL);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
trap_item->action = DEVLINK_TRAP_ACTION_DROP;
}
/**
* devl_traps_register - Register packet traps with devlink.
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
* @devlink: devlink.
* @traps: Packet traps.
* @traps_count: Count of provided packet traps.
* @priv: Driver private information.
*
* Return: Non-zero value on failure.
*/
int devl_traps_register(struct devlink *devlink,
const struct devlink_trap *traps,
size_t traps_count, void *priv)
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
{
int i, err;
if (!devlink->ops->trap_init || !devlink->ops->trap_action_set)
return -EINVAL;
devl_assert_locked(devlink);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
for (i = 0; i < traps_count; i++) {
const struct devlink_trap *trap = &traps[i];
err = devlink_trap_verify(trap);
if (err)
goto err_trap_verify;
err = devlink_trap_register(devlink, trap, priv);
if (err)
goto err_trap_register;
}
return 0;
err_trap_register:
err_trap_verify:
for (i--; i >= 0; i--)
devlink_trap_unregister(devlink, &traps[i]);
return err;
}
EXPORT_SYMBOL_GPL(devl_traps_register);
/**
* devlink_traps_register - Register packet traps with devlink.
* @devlink: devlink.
* @traps: Packet traps.
* @traps_count: Count of provided packet traps.
* @priv: Driver private information.
*
* Context: Takes and release devlink->lock <mutex>.
*
* Return: Non-zero value on failure.
*/
int devlink_traps_register(struct devlink *devlink,
const struct devlink_trap *traps,
size_t traps_count, void *priv)
{
int err;
devl_lock(devlink);
err = devl_traps_register(devlink, traps, traps_count, priv);
devl_unlock(devlink);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
return err;
}
EXPORT_SYMBOL_GPL(devlink_traps_register);
/**
* devl_traps_unregister - Unregister packet traps from devlink.
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
* @devlink: devlink.
* @traps: Packet traps.
* @traps_count: Count of provided packet traps.
*/
void devl_traps_unregister(struct devlink *devlink,
const struct devlink_trap *traps,
size_t traps_count)
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
{
int i;
devl_assert_locked(devlink);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
/* Make sure we do not have any packets in-flight while unregistering
* traps by disabling all of them and waiting for a grace period.
*/
for (i = traps_count - 1; i >= 0; i--)
devlink_trap_disable(devlink, &traps[i]);
synchronize_rcu();
for (i = traps_count - 1; i >= 0; i--)
devlink_trap_unregister(devlink, &traps[i]);
}
EXPORT_SYMBOL_GPL(devl_traps_unregister);
/**
* devlink_traps_unregister - Unregister packet traps from devlink.
* @devlink: devlink.
* @traps: Packet traps.
* @traps_count: Count of provided packet traps.
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_traps_unregister(struct devlink *devlink,
const struct devlink_trap *traps,
size_t traps_count)
{
devl_lock(devlink);
devl_traps_unregister(devlink, traps, traps_count);
devl_unlock(devlink);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
EXPORT_SYMBOL_GPL(devlink_traps_unregister);
static void
devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats,
size_t skb_len)
{
struct devlink_stats *stats;
stats = this_cpu_ptr(trap_stats);
u64_stats_update_begin(&stats->syncp);
u64_stats_add(&stats->rx_bytes, skb_len);
u64_stats_inc(&stats->rx_packets);
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
u64_stats_update_end(&stats->syncp);
}
static void
devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata,
const struct devlink_trap_item *trap_item,
struct devlink_port *in_devlink_port,
const struct flow_action_cookie *fa_cookie)
{
metadata->trap_name = trap_item->trap->name;
metadata->trap_group_name = trap_item->group_item->group->name;
metadata->fa_cookie = fa_cookie;
metadata->trap_type = trap_item->trap->type;
spin_lock(&in_devlink_port->type_lock);
if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
metadata->input_dev = in_devlink_port->type_eth.netdev;
spin_unlock(&in_devlink_port->type_lock);
}
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
/**
* devlink_trap_report - Report trapped packet to drop monitor.
* @devlink: devlink.
* @skb: Trapped packet.
* @trap_ctx: Trap context.
* @in_devlink_port: Input devlink port.
* @fa_cookie: Flow action cookie. Could be NULL.
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
*/
void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
void *trap_ctx, struct devlink_port *in_devlink_port,
const struct flow_action_cookie *fa_cookie)
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
{
struct devlink_trap_item *trap_item = trap_ctx;
devlink_trap_stats_update(trap_item->stats, skb->len);
devlink_trap_stats_update(trap_item->group_item->stats, skb->len);
if (trace_devlink_trap_report_enabled()) {
struct devlink_trap_metadata metadata = {};
devlink_trap_report_metadata_set(&metadata, trap_item,
in_devlink_port, fa_cookie);
trace_devlink_trap_report(devlink, skb, &metadata);
}
devlink: Add packet trap infrastructure Add the basic packet trap infrastructure that allows device drivers to register their supported packet traps and trap groups with devlink. Each driver is expected to provide basic information about each supported trap, such as name and ID, but also the supported metadata types that will accompany each packet trapped via the trap. The currently supported metadata type is just the input port, but more will be added in the future. For example, output port and traffic class. Trap groups allow users to set the action of all member traps. In addition, users can retrieve per-group statistics in case per-trap statistics are too narrow. In the future, the trap group object can be extended with more attributes, such as policer settings which will limit the amount of traffic generated by member traps towards the CPU. Beside registering their packet traps with devlink, drivers are also expected to report trapped packets to devlink along with relevant metadata. devlink will maintain packets and bytes statistics for each packet trap and will potentially report the trapped packet with its metadata to user space via drop monitor netlink channel. The interface towards the drivers is simple and allows devlink to set the action of the trap. Currently, only two actions are supported: 'trap' and 'drop'. When set to 'trap', the device is expected to provide the sole copy of the packet to the driver which will pass it to devlink. When set to 'drop', the device is expected to drop the packet and not send a copy to the driver. In the future, more actions can be added, such as 'mirror'. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-08-17 16:28:17 +03:00
}
EXPORT_SYMBOL_GPL(devlink_trap_report);
/**
* devlink_trap_ctx_priv - Trap context to driver private information.
* @trap_ctx: Trap context.
*
* Return: Driver private information passed during registration.
*/
void *devlink_trap_ctx_priv(void *trap_ctx)
{
struct devlink_trap_item *trap_item = trap_ctx;
return trap_item->priv;
}
EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv);
static int
devlink_trap_group_item_policer_link(struct devlink *devlink,
struct devlink_trap_group_item *group_item)
{
u32 policer_id = group_item->group->init_policer_id;
struct devlink_trap_policer_item *policer_item;
if (policer_id == 0)
return 0;
policer_item = devlink_trap_policer_item_lookup(devlink, policer_id);
if (WARN_ON_ONCE(!policer_item))
return -EINVAL;
group_item->policer_item = policer_item;
return 0;
}
static int
devlink_trap_group_register(struct devlink *devlink,
const struct devlink_trap_group *group)
{
struct devlink_trap_group_item *group_item;
int err;
if (devlink_trap_group_item_lookup(devlink, group->name))
return -EEXIST;
group_item = kzalloc(sizeof(*group_item), GFP_KERNEL);
if (!group_item)
return -ENOMEM;
group_item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
if (!group_item->stats) {
err = -ENOMEM;
goto err_stats_alloc;
}
group_item->group = group;
err = devlink_trap_group_item_policer_link(devlink, group_item);
if (err)
goto err_policer_link;
if (devlink->ops->trap_group_init) {
err = devlink->ops->trap_group_init(devlink, group);
if (err)
goto err_group_init;
}
list_add_tail(&group_item->list, &devlink->trap_group_list);
devlink_trap_group_notify(devlink, group_item,
DEVLINK_CMD_TRAP_GROUP_NEW);
return 0;
err_group_init:
err_policer_link:
free_percpu(group_item->stats);
err_stats_alloc:
kfree(group_item);
return err;
}
static void
devlink_trap_group_unregister(struct devlink *devlink,
const struct devlink_trap_group *group)
{
struct devlink_trap_group_item *group_item;
group_item = devlink_trap_group_item_lookup(devlink, group->name);
if (WARN_ON_ONCE(!group_item))
return;
devlink_trap_group_notify(devlink, group_item,
DEVLINK_CMD_TRAP_GROUP_DEL);
list_del(&group_item->list);
free_percpu(group_item->stats);
kfree(group_item);
}
/**
* devl_trap_groups_register - Register packet trap groups with devlink.
* @devlink: devlink.
* @groups: Packet trap groups.
* @groups_count: Count of provided packet trap groups.
*
* Return: Non-zero value on failure.
*/
int devl_trap_groups_register(struct devlink *devlink,
const struct devlink_trap_group *groups,
size_t groups_count)
{
int i, err;
devl_assert_locked(devlink);
for (i = 0; i < groups_count; i++) {
const struct devlink_trap_group *group = &groups[i];
err = devlink_trap_group_verify(group);
if (err)
goto err_trap_group_verify;
err = devlink_trap_group_register(devlink, group);
if (err)
goto err_trap_group_register;
}
return 0;
err_trap_group_register:
err_trap_group_verify:
for (i--; i >= 0; i--)
devlink_trap_group_unregister(devlink, &groups[i]);
return err;
}
EXPORT_SYMBOL_GPL(devl_trap_groups_register);
/**
* devlink_trap_groups_register - Register packet trap groups with devlink.
* @devlink: devlink.
* @groups: Packet trap groups.
* @groups_count: Count of provided packet trap groups.
*
* Context: Takes and release devlink->lock <mutex>.
*
* Return: Non-zero value on failure.
*/
int devlink_trap_groups_register(struct devlink *devlink,
const struct devlink_trap_group *groups,
size_t groups_count)
{
int err;
devl_lock(devlink);
err = devl_trap_groups_register(devlink, groups, groups_count);
devl_unlock(devlink);
return err;
}
EXPORT_SYMBOL_GPL(devlink_trap_groups_register);
/**
* devl_trap_groups_unregister - Unregister packet trap groups from devlink.
* @devlink: devlink.
* @groups: Packet trap groups.
* @groups_count: Count of provided packet trap groups.
*/
void devl_trap_groups_unregister(struct devlink *devlink,
const struct devlink_trap_group *groups,
size_t groups_count)
{
int i;
devl_assert_locked(devlink);
for (i = groups_count - 1; i >= 0; i--)
devlink_trap_group_unregister(devlink, &groups[i]);
}
EXPORT_SYMBOL_GPL(devl_trap_groups_unregister);
/**
* devlink_trap_groups_unregister - Unregister packet trap groups from devlink.
* @devlink: devlink.
* @groups: Packet trap groups.
* @groups_count: Count of provided packet trap groups.
*
* Context: Takes and release devlink->lock <mutex>.
*/
void devlink_trap_groups_unregister(struct devlink *devlink,
const struct devlink_trap_group *groups,
size_t groups_count)
{
devl_lock(devlink);
devl_trap_groups_unregister(devlink, groups, groups_count);
devl_unlock(devlink);
}
EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister);
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
static void
devlink_trap_policer_notify(struct devlink *devlink,
const struct devlink_trap_policer_item *policer_item,
enum devlink_command cmd)
{
struct sk_buff *msg;
int err;
WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW &&
cmd != DEVLINK_CMD_TRAP_POLICER_DEL);
if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
return;
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return;
err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0,
0, 0);
if (err) {
nlmsg_free(msg);
return;
}
genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
}
static int
devlink_trap_policer_register(struct devlink *devlink,
const struct devlink_trap_policer *policer)
{
struct devlink_trap_policer_item *policer_item;
int err;
if (devlink_trap_policer_item_lookup(devlink, policer->id))
return -EEXIST;
policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL);
if (!policer_item)
return -ENOMEM;
policer_item->policer = policer;
policer_item->rate = policer->init_rate;
policer_item->burst = policer->init_burst;
if (devlink->ops->trap_policer_init) {
err = devlink->ops->trap_policer_init(devlink, policer);
if (err)
goto err_policer_init;
}
list_add_tail(&policer_item->list, &devlink->trap_policer_list);
devlink_trap_policer_notify(devlink, policer_item,
DEVLINK_CMD_TRAP_POLICER_NEW);
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
return 0;
err_policer_init:
kfree(policer_item);
return err;
}
static void
devlink_trap_policer_unregister(struct devlink *devlink,
const struct devlink_trap_policer *policer)
{
struct devlink_trap_policer_item *policer_item;
policer_item = devlink_trap_policer_item_lookup(devlink, policer->id);
if (WARN_ON_ONCE(!policer_item))
return;
devlink_trap_policer_notify(devlink, policer_item,
DEVLINK_CMD_TRAP_POLICER_DEL);
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
list_del(&policer_item->list);
if (devlink->ops->trap_policer_fini)
devlink->ops->trap_policer_fini(devlink, policer);
kfree(policer_item);
}
/**
* devl_trap_policers_register - Register packet trap policers with devlink.
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
* @devlink: devlink.
* @policers: Packet trap policers.
* @policers_count: Count of provided packet trap policers.
*
* Return: Non-zero value on failure.
*/
int
devl_trap_policers_register(struct devlink *devlink,
const struct devlink_trap_policer *policers,
size_t policers_count)
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
{
int i, err;
devl_assert_locked(devlink);
devlink: Add packet trap policers support Devices capable of offloading the kernel's datapath and perform functions such as bridging and routing must also be able to send (trap) specific packets to the kernel (i.e., the CPU) for processing. For example, a device acting as a multicast-aware bridge must be able to trap IGMP membership reports to the kernel for processing by the bridge module. In most cases, the underlying device is capable of handling packet rates that are several orders of magnitude higher compared to those that can be handled by the CPU. Therefore, in order to prevent the underlying device from overwhelming the CPU, devices usually include packet trap policers that are able to police the trapped packets to rates that can be handled by the CPU. This patch allows capable device drivers to register their supported packet trap policers with devlink. User space can then tune the parameters of these policer (currently, rate and burst size) and read from the device the number of packets that were dropped by the policer, if supported. Subsequent patches in the series will allow device drivers to create default binding between these policers and packet trap groups and allow user space to change the binding. v2: * Add 'strict_start_type' in devlink policy * Have device drivers provide max/min rate/burst size for each policer. Use them to check validity of user provided parameters Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reviewed-by: Jiri Pirko <jiri@mellanox.com> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-03-30 22:38:18 +03:00
for (i = 0; i < policers_count; i++) {
const struct devlink_trap_policer *policer = &policers[i];
if (WARN_ON(policer->id == 0 ||
policer->max_rate < policer->min_rate ||
policer->max_burst < policer->min_burst)) {
err = -EINVAL;
goto err_trap_policer_verify;
}
err = devlink_trap_policer_register(devlink, policer);
if (err)
goto err_trap_policer_register;
}
return 0;
err_trap_policer_register:
err_trap_policer_verify:
for (i--; i >= 0; i--)
devlink_trap_policer_unregister(devlink, &policers[i]);
return err;
}
EXPORT_SYMBOL_GPL(devl_trap_policers_register);
/**
* devl_trap_policers_unregister - Unregister packet trap policers from devlink.
* @devlink: devlink.
* @policers: Packet trap policers.
* @policers_count: Count of provided packet trap policers.
*/
void
devl_trap_policers_unregister(struct devlink *devlink,
const struct devlink_trap_policer *policers,
size_t policers_count)
{
int i;
devl_assert_locked(devlink);
for (i = policers_count - 1; i >= 0; i--)
devlink_trap_policer_unregister(devlink, &policers[i]);
}
EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
int devlink_compat_phys_port_name_get(struct net_device *dev,
char *name, size_t len)
{
struct devlink_port *devlink_port;
/* RTNL mutex is held here which ensures that devlink_port
* instance cannot disappear in the middle. No need to take
* any devlink lock as only permanent values are accessed.
*/
ASSERT_RTNL();
devlink_port = dev->devlink_port;
if (!devlink_port)
return -EOPNOTSUPP;
return __devlink_port_phys_port_name_get(devlink_port, name, len);
}
int devlink_compat_switch_id_get(struct net_device *dev,
struct netdev_phys_item_id *ppid)
{
struct devlink_port *devlink_port;
/* Caller must hold RTNL mutex or reference to dev, which ensures that
* devlink_port instance cannot disappear in the middle. No need to take
* any devlink lock as only permanent values are accessed.
*/
devlink_port = dev->devlink_port;
if (!devlink_port || !devlink_port->switch_port)
return -EOPNOTSUPP;
memcpy(ppid, &devlink_port->attrs.switch_id, sizeof(*ppid));
return 0;
}