2017-11-01 15:09:13 +01:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
|
2012-10-13 10:46:48 +01:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* Definitions of the Internet Protocol.
|
|
|
|
*
|
|
|
|
* Version: @(#)in.h 1.0.1 04/21/93
|
|
|
|
*
|
|
|
|
* Authors: Original taken from the GNU Project <netinet/in.h> file.
|
|
|
|
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
#ifndef _UAPI_LINUX_IN_H
|
|
|
|
#define _UAPI_LINUX_IN_H
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
2022-11-02 11:25:16 -07:00
|
|
|
#include <linux/stddef.h>
|
2015-06-29 14:57:48 -10:00
|
|
|
#include <linux/libc-compat.h>
|
2012-10-13 10:46:48 +01:00
|
|
|
#include <linux/socket.h>
|
|
|
|
|
2015-06-29 14:57:48 -10:00
|
|
|
#if __UAPI_DEF_IN_IPPROTO
|
2012-10-13 10:46:48 +01:00
|
|
|
/* Standard well-defined IP protocols. */
|
|
|
|
enum {
|
|
|
|
IPPROTO_IP = 0, /* Dummy protocol for TCP */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_IP IPPROTO_IP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_ICMP = 1, /* Internet Control Message Protocol */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_ICMP IPPROTO_ICMP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_IGMP = 2, /* Internet Group Management Protocol */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_IGMP IPPROTO_IGMP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_IPIP IPPROTO_IPIP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_TCP = 6, /* Transmission Control Protocol */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_TCP IPPROTO_TCP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_EGP = 8, /* Exterior Gateway Protocol */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_EGP IPPROTO_EGP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_PUP = 12, /* PUP protocol */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_PUP IPPROTO_PUP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_UDP = 17, /* User Datagram Protocol */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_UDP IPPROTO_UDP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_IDP = 22, /* XNS IDP protocol */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_IDP IPPROTO_IDP
|
|
|
|
IPPROTO_TP = 29, /* ISO Transport Protocol Class 4 */
|
|
|
|
#define IPPROTO_TP IPPROTO_TP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_DCCP IPPROTO_DCCP
|
|
|
|
IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */
|
|
|
|
#define IPPROTO_IPV6 IPPROTO_IPV6
|
|
|
|
IPPROTO_RSVP = 46, /* RSVP Protocol */
|
|
|
|
#define IPPROTO_RSVP IPPROTO_RSVP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_GRE IPPROTO_GRE
|
|
|
|
IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */
|
|
|
|
#define IPPROTO_ESP IPPROTO_ESP
|
|
|
|
IPPROTO_AH = 51, /* Authentication Header protocol */
|
|
|
|
#define IPPROTO_AH IPPROTO_AH
|
|
|
|
IPPROTO_MTP = 92, /* Multicast Transport Protocol */
|
|
|
|
#define IPPROTO_MTP IPPROTO_MTP
|
|
|
|
IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */
|
|
|
|
#define IPPROTO_BEETPH IPPROTO_BEETPH
|
|
|
|
IPPROTO_ENCAP = 98, /* Encapsulation Header */
|
|
|
|
#define IPPROTO_ENCAP IPPROTO_ENCAP
|
|
|
|
IPPROTO_PIM = 103, /* Protocol Independent Multicast */
|
|
|
|
#define IPPROTO_PIM IPPROTO_PIM
|
|
|
|
IPPROTO_COMP = 108, /* Compression Header Protocol */
|
|
|
|
#define IPPROTO_COMP IPPROTO_COMP
|
2022-09-08 10:16:40 -07:00
|
|
|
IPPROTO_L2TP = 115, /* Layer 2 Tunnelling Protocol */
|
|
|
|
#define IPPROTO_L2TP IPPROTO_L2TP
|
2013-08-15 17:28:10 +08:00
|
|
|
IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */
|
|
|
|
#define IPPROTO_SCTP IPPROTO_SCTP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */
|
2013-08-15 17:28:10 +08:00
|
|
|
#define IPPROTO_UDPLITE IPPROTO_UDPLITE
|
2015-06-04 09:16:37 -07:00
|
|
|
IPPROTO_MPLS = 137, /* MPLS in IP (RFC 4023) */
|
|
|
|
#define IPPROTO_MPLS IPPROTO_MPLS
|
2020-03-11 17:54:06 +01:00
|
|
|
IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */
|
|
|
|
#define IPPROTO_ETHERNET IPPROTO_ETHERNET
|
2013-08-15 17:28:10 +08:00
|
|
|
IPPROTO_RAW = 255, /* Raw IP packets */
|
|
|
|
#define IPPROTO_RAW IPPROTO_RAW
|
2020-01-09 07:59:16 -08:00
|
|
|
IPPROTO_MPTCP = 262, /* Multipath TCP connection */
|
|
|
|
#define IPPROTO_MPTCP IPPROTO_MPTCP
|
2012-10-13 10:46:48 +01:00
|
|
|
IPPROTO_MAX
|
|
|
|
};
|
2015-06-29 14:57:48 -10:00
|
|
|
#endif
|
2012-10-13 10:46:48 +01:00
|
|
|
|
2015-06-29 14:57:48 -10:00
|
|
|
#if __UAPI_DEF_IN_ADDR
|
2012-10-13 10:46:48 +01:00
|
|
|
/* Internet address. */
|
|
|
|
struct in_addr {
|
|
|
|
__be32 s_addr;
|
|
|
|
};
|
2015-06-29 14:57:48 -10:00
|
|
|
#endif
|
2012-10-13 10:46:48 +01:00
|
|
|
|
|
|
|
#define IP_TOS 1
|
|
|
|
#define IP_TTL 2
|
|
|
|
#define IP_HDRINCL 3
|
|
|
|
#define IP_OPTIONS 4
|
|
|
|
#define IP_ROUTER_ALERT 5
|
|
|
|
#define IP_RECVOPTS 6
|
|
|
|
#define IP_RETOPTS 7
|
|
|
|
#define IP_PKTINFO 8
|
|
|
|
#define IP_PKTOPTIONS 9
|
|
|
|
#define IP_MTU_DISCOVER 10
|
|
|
|
#define IP_RECVERR 11
|
|
|
|
#define IP_RECVTTL 12
|
|
|
|
#define IP_RECVTOS 13
|
|
|
|
#define IP_MTU 14
|
|
|
|
#define IP_FREEBIND 15
|
|
|
|
#define IP_IPSEC_POLICY 16
|
|
|
|
#define IP_XFRM_POLICY 17
|
|
|
|
#define IP_PASSSEC 18
|
|
|
|
#define IP_TRANSPARENT 19
|
|
|
|
|
|
|
|
/* BSD compatibility */
|
|
|
|
#define IP_RECVRETOPTS IP_RETOPTS
|
|
|
|
|
|
|
|
/* TProxy original addresses */
|
|
|
|
#define IP_ORIGDSTADDR 20
|
|
|
|
#define IP_RECVORIGDSTADDR IP_ORIGDSTADDR
|
|
|
|
|
|
|
|
#define IP_MINTTL 21
|
|
|
|
#define IP_NODEFRAG 22
|
2015-01-05 13:56:17 -08:00
|
|
|
#define IP_CHECKSUM 23
|
inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations
When an application needs to force a source IP on an active TCP socket
it has to use bind(IP, port=x).
As most applications do not want to deal with already used ports, x is
often set to 0, meaning the kernel is in charge to find an available
port.
But kernel does not know yet if this socket is going to be a listener or
be connected.
It has very limited choices (no full knowledge of final 4-tuple for a
connect())
With limited ephemeral port range (about 32K ports), it is very easy to
fill the space.
This patch adds a new SOL_IP socket option, asking kernel to ignore
the 0 port provided by application in bind(IP, port=0) and only
remember the given IP address.
The port will be automatically chosen at connect() time, in a way
that allows sharing a source port as long as the 4-tuples are unique.
This new feature is available for both IPv4 and IPv6 (Thanks Neal)
Tested:
Wrote a test program and checked its behavior on IPv4 and IPv6.
strace(1) shows sequences of bind(IP=127.0.0.2, port=0) followed by
connect().
Also getsockname() show that the port is still 0 right after bind()
but properly allocated after connect().
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 5
setsockopt(5, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
connect(5, {sa_family=AF_INET, sin_port=htons(53174), sin_addr=inet_addr("127.0.0.3")}, 16) = 0
getsockname(5, {sa_family=AF_INET, sin_port=htons(38050), sin_addr=inet_addr("127.0.0.2")}, [16]) = 0
IPv6 test :
socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 7
setsockopt(7, SOL_IP, IP_BIND_ADDRESS_NO_PORT, [1], 4) = 0
bind(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0
connect(7, {sa_family=AF_INET6, sin6_port=htons(57300), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 0
getsockname(7, {sa_family=AF_INET6, sin6_port=htons(60964), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, [28]) = 0
I was able to bind()/connect() a million concurrent IPv4 sockets,
instead of ~32000 before patch.
lpaa23:~# ulimit -n 1000010
lpaa23:~# ./bind --connect --num-flows=1000000 &
1000000 sockets
lpaa23:~# grep TCP /proc/net/sockstat
TCP: inuse 2000063 orphan 0 tw 47 alloc 2000157 mem 66
Check that a given source port is indeed used by many different
connections :
lpaa23:~# ss -t src :40000 | head -10
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 0 127.0.0.2:40000 127.0.202.33:44983
ESTAB 0 0 127.0.0.2:40000 127.2.27.240:44983
ESTAB 0 0 127.0.0.2:40000 127.2.98.5:44983
ESTAB 0 0 127.0.0.2:40000 127.0.124.196:44983
ESTAB 0 0 127.0.0.2:40000 127.2.139.38:44983
ESTAB 0 0 127.0.0.2:40000 127.1.59.80:44983
ESTAB 0 0 127.0.0.2:40000 127.3.6.228:44983
ESTAB 0 0 127.0.0.2:40000 127.0.38.53:44983
ESTAB 0 0 127.0.0.2:40000 127.1.197.10:44983
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-06-06 21:17:57 -07:00
|
|
|
#define IP_BIND_ADDRESS_NO_PORT 24
|
2016-11-02 11:02:16 -04:00
|
|
|
#define IP_RECVFRAGSIZE 25
|
icmp: support rfc 4884
Add setsockopt SOL_IP/IP_RECVERR_RFC4884 to return the offset to an
extension struct if present.
ICMP messages may include an extension structure after the original
datagram. RFC 4884 standardized this behavior. It stores the offset
in words to the extension header in u8 icmphdr.un.reserved[1].
The field is valid only for ICMP types destination unreachable, time
exceeded and parameter problem, if length is at least 128 bytes and
entire packet does not exceed 576 bytes.
Return the offset to the start of the extension struct when reading an
ICMP error from the error queue, if it matches the above constraints.
Do not return the raw u8 field. Return the offset from the start of
the user buffer, in bytes. The kernel does not return the network and
transport headers, so subtract those.
Also validate the headers. Return the offset regardless of validation,
as an invalid extension must still not be misinterpreted as part of
the original datagram. Note that !invalid does not imply valid. If
the extension version does not match, no validation can take place,
for instance.
For backward compatibility, make this optional, set by setsockopt
SOL_IP/IP_RECVERR_RFC4884. For API example and feature test, see
github.com/wdebruij/kerneltools/blob/master/tests/recv_icmp_v2.c
For forward compatibility, reserve only setsockopt value 1, leaving
other bits for additional icmp extensions.
Changes
v1->v2:
- convert word offset to byte offset from start of user buffer
- return in ee_data as u8 may be insufficient
- define extension struct and object header structs
- return len only if constraints met
- if returning len, also validate
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-07-10 09:29:02 -04:00
|
|
|
#define IP_RECVERR_RFC4884 26
|
2012-10-13 10:46:48 +01:00
|
|
|
|
|
|
|
/* IP_MTU_DISCOVER values */
|
|
|
|
#define IP_PMTUDISC_DONT 0 /* Never send DF frames */
|
|
|
|
#define IP_PMTUDISC_WANT 1 /* Use per route hints */
|
|
|
|
#define IP_PMTUDISC_DO 2 /* Always DF */
|
|
|
|
#define IP_PMTUDISC_PROBE 3 /* Ignore dst pmtu */
|
2013-11-05 02:24:17 +01:00
|
|
|
/* Always use interface mtu (ignores dst pmtu) but don't set DF flag.
|
|
|
|
* Also incoming ICMP frag_needed notifications will be ignored on
|
|
|
|
* this socket to prevent accepting spoofed ones.
|
|
|
|
*/
|
|
|
|
#define IP_PMTUDISC_INTERFACE 4
|
2020-08-27 07:27:49 -04:00
|
|
|
/* weaker version of IP_PMTUDISC_INTERFACE, which allows packets to get
|
2014-02-26 01:20:42 +01:00
|
|
|
* fragmented if they exceed the interface mtu
|
|
|
|
*/
|
|
|
|
#define IP_PMTUDISC_OMIT 5
|
2012-10-13 10:46:48 +01:00
|
|
|
|
|
|
|
#define IP_MULTICAST_IF 32
|
|
|
|
#define IP_MULTICAST_TTL 33
|
|
|
|
#define IP_MULTICAST_LOOP 34
|
|
|
|
#define IP_ADD_MEMBERSHIP 35
|
|
|
|
#define IP_DROP_MEMBERSHIP 36
|
|
|
|
#define IP_UNBLOCK_SOURCE 37
|
|
|
|
#define IP_BLOCK_SOURCE 38
|
|
|
|
#define IP_ADD_SOURCE_MEMBERSHIP 39
|
|
|
|
#define IP_DROP_SOURCE_MEMBERSHIP 40
|
|
|
|
#define IP_MSFILTER 41
|
|
|
|
#define MCAST_JOIN_GROUP 42
|
|
|
|
#define MCAST_BLOCK_SOURCE 43
|
|
|
|
#define MCAST_UNBLOCK_SOURCE 44
|
|
|
|
#define MCAST_LEAVE_GROUP 45
|
|
|
|
#define MCAST_JOIN_SOURCE_GROUP 46
|
|
|
|
#define MCAST_LEAVE_SOURCE_GROUP 47
|
|
|
|
#define MCAST_MSFILTER 48
|
|
|
|
#define IP_MULTICAST_ALL 49
|
|
|
|
#define IP_UNICAST_IF 50
|
inet: Add IP_LOCAL_PORT_RANGE socket option
Users who want to share a single public IP address for outgoing connections
between several hosts traditionally reach for SNAT. However, SNAT requires
state keeping on the node(s) performing the NAT.
A stateless alternative exists, where a single IP address used for egress
can be shared between several hosts by partitioning the available ephemeral
port range. In such a setup:
1. Each host gets assigned a disjoint range of ephemeral ports.
2. Applications open connections from the host-assigned port range.
3. Return traffic gets routed to the host based on both, the destination IP
and the destination port.
An application which wants to open an outgoing connection (connect) from a
given port range today can choose between two solutions:
1. Manually pick the source port by bind()'ing to it before connect()'ing
the socket.
This approach has a couple of downsides:
a) Search for a free port has to be implemented in the user-space. If
the chosen 4-tuple happens to be busy, the application needs to retry
from a different local port number.
Detecting if 4-tuple is busy can be either easy (TCP) or hard
(UDP). In TCP case, the application simply has to check if connect()
returned an error (EADDRNOTAVAIL). That is assuming that the local
port sharing was enabled (REUSEADDR) by all the sockets.
# Assume desired local port range is 60_000-60_511
s = socket(AF_INET, SOCK_STREAM)
s.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
s.bind(("192.0.2.1", 60_000))
s.connect(("1.1.1.1", 53))
# Fails only if 192.0.2.1:60000 -> 1.1.1.1:53 is busy
# Application must retry with another local port
In case of UDP, the network stack allows binding more than one socket
to the same 4-tuple, when local port sharing is enabled
(REUSEADDR). Hence detecting the conflict is much harder and involves
querying sock_diag and toggling the REUSEADDR flag [1].
b) For TCP, bind()-ing to a port within the ephemeral port range means
that no connecting sockets, that is those which leave it to the
network stack to find a free local port at connect() time, can use
this port.
IOW, the bind hash bucket tb->fastreuse will be 0 or 1, and the port
will be skipped during the free port search at connect() time.
2. Isolate the app in a dedicated netns and use the per-netns
ip_local_port_range sysctl to adjust the ephemeral port range bounds.
The per-netns setting affects all sockets, so this approach can be used
only if:
- there is just one egress IP address, or
- the desired egress port range is the same for all egress IP addresses
used by the application.
For TCP, this approach avoids the downsides of (1). Free port search and
4-tuple conflict detection is done by the network stack:
system("sysctl -w net.ipv4.ip_local_port_range='60000 60511'")
s = socket(AF_INET, SOCK_STREAM)
s.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1)
s.bind(("192.0.2.1", 0))
s.connect(("1.1.1.1", 53))
# Fails if all 4-tuples 192.0.2.1:60000-60511 -> 1.1.1.1:53 are busy
For UDP this approach has limited applicability. Setting the
IP_BIND_ADDRESS_NO_PORT socket option does not result in local source
port being shared with other connected UDP sockets.
Hence relying on the network stack to find a free source port, limits the
number of outgoing UDP flows from a single IP address down to the number
of available ephemeral ports.
To put it another way, partitioning the ephemeral port range between hosts
using the existing Linux networking API is cumbersome.
To address this use case, add a new socket option at the SOL_IP level,
named IP_LOCAL_PORT_RANGE. The new option can be used to clamp down the
ephemeral port range for each socket individually.
The option can be used only to narrow down the per-netns local port
range. If the per-socket range lies outside of the per-netns range, the
latter takes precedence.
UAPI-wise, the low and high range bounds are passed to the kernel as a pair
of u16 values in host byte order packed into a u32. This avoids pointer
passing.
PORT_LO = 40_000
PORT_HI = 40_511
s = socket(AF_INET, SOCK_STREAM)
v = struct.pack("I", PORT_HI << 16 | PORT_LO)
s.setsockopt(SOL_IP, IP_LOCAL_PORT_RANGE, v)
s.bind(("127.0.0.1", 0))
s.getsockname()
# Local address between ("127.0.0.1", 40_000) and ("127.0.0.1", 40_511),
# if there is a free port. EADDRINUSE otherwise.
[1] https://github.com/cloudflare/cloudflare-blog/blob/232b432c1d57/2022-02-connectx/connectx.py#L116
Reviewed-by: Marek Majkowski <marek@cloudflare.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-01-24 14:36:43 +01:00
|
|
|
#define IP_LOCAL_PORT_RANGE 51
|
2023-05-22 14:08:20 +02:00
|
|
|
#define IP_PROTOCOL 52
|
2012-10-13 10:46:48 +01:00
|
|
|
|
|
|
|
#define MCAST_EXCLUDE 0
|
|
|
|
#define MCAST_INCLUDE 1
|
|
|
|
|
|
|
|
/* These need to appear somewhere around here */
|
|
|
|
#define IP_DEFAULT_MULTICAST_TTL 1
|
|
|
|
#define IP_DEFAULT_MULTICAST_LOOP 1
|
|
|
|
|
|
|
|
/* Request struct for multicast socket ops */
|
|
|
|
|
2015-06-29 14:57:48 -10:00
|
|
|
#if __UAPI_DEF_IP_MREQ
|
2012-10-13 10:46:48 +01:00
|
|
|
struct ip_mreq {
|
|
|
|
struct in_addr imr_multiaddr; /* IP multicast address of group */
|
|
|
|
struct in_addr imr_interface; /* local IP address of interface */
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ip_mreqn {
|
|
|
|
struct in_addr imr_multiaddr; /* IP multicast address of group */
|
|
|
|
struct in_addr imr_address; /* local IP address of interface */
|
|
|
|
int imr_ifindex; /* Interface index */
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ip_mreq_source {
|
|
|
|
__be32 imr_multiaddr;
|
|
|
|
__be32 imr_interface;
|
|
|
|
__be32 imr_sourceaddr;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ip_msfilter {
|
2022-08-31 14:12:42 -05:00
|
|
|
__be32 imsf_multiaddr;
|
|
|
|
__be32 imsf_interface;
|
|
|
|
__u32 imsf_fmode;
|
|
|
|
__u32 imsf_numsrc;
|
2021-07-31 12:08:30 -05:00
|
|
|
union {
|
2022-08-31 14:12:42 -05:00
|
|
|
__be32 imsf_slist[1];
|
|
|
|
__DECLARE_FLEX_ARRAY(__be32, imsf_slist_flex);
|
2021-07-31 12:08:30 -05:00
|
|
|
};
|
2012-10-13 10:46:48 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
#define IP_MSFILTER_SIZE(numsrc) \
|
|
|
|
(sizeof(struct ip_msfilter) - sizeof(__u32) \
|
|
|
|
+ (numsrc) * sizeof(__u32))
|
|
|
|
|
|
|
|
struct group_req {
|
|
|
|
__u32 gr_interface; /* interface index */
|
|
|
|
struct __kernel_sockaddr_storage gr_group; /* group address */
|
|
|
|
};
|
|
|
|
|
|
|
|
struct group_source_req {
|
|
|
|
__u32 gsr_interface; /* interface index */
|
|
|
|
struct __kernel_sockaddr_storage gsr_group; /* group address */
|
|
|
|
struct __kernel_sockaddr_storage gsr_source; /* source address */
|
|
|
|
};
|
|
|
|
|
|
|
|
struct group_filter {
|
2021-08-04 15:45:36 -05:00
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
__u32 gf_interface_aux; /* interface index */
|
|
|
|
struct __kernel_sockaddr_storage gf_group_aux; /* multicast address */
|
|
|
|
__u32 gf_fmode_aux; /* filter mode */
|
|
|
|
__u32 gf_numsrc_aux; /* number of sources */
|
|
|
|
struct __kernel_sockaddr_storage gf_slist[1]; /* source address list */
|
|
|
|
};
|
|
|
|
struct {
|
|
|
|
__u32 gf_interface; /* interface index */
|
|
|
|
struct __kernel_sockaddr_storage gf_group; /* multicast address */
|
|
|
|
__u32 gf_fmode; /* filter mode */
|
|
|
|
__u32 gf_numsrc; /* number of sources */
|
|
|
|
struct __kernel_sockaddr_storage gf_slist_flex[]; /* source address list */
|
|
|
|
};
|
|
|
|
};
|
2012-10-13 10:46:48 +01:00
|
|
|
};
|
|
|
|
|
|
|
|
#define GROUP_FILTER_SIZE(numsrc) \
|
|
|
|
(sizeof(struct group_filter) - sizeof(struct __kernel_sockaddr_storage) \
|
|
|
|
+ (numsrc) * sizeof(struct __kernel_sockaddr_storage))
|
2015-06-29 14:57:48 -10:00
|
|
|
#endif
|
2012-10-13 10:46:48 +01:00
|
|
|
|
2015-06-29 14:57:48 -10:00
|
|
|
#if __UAPI_DEF_IN_PKTINFO
|
2012-10-13 10:46:48 +01:00
|
|
|
struct in_pktinfo {
|
|
|
|
int ipi_ifindex;
|
|
|
|
struct in_addr ipi_spec_dst;
|
|
|
|
struct in_addr ipi_addr;
|
|
|
|
};
|
2015-06-29 14:57:48 -10:00
|
|
|
#endif
|
2012-10-13 10:46:48 +01:00
|
|
|
|
|
|
|
/* Structure describing an Internet (IP) socket address. */
|
2015-06-29 14:57:48 -10:00
|
|
|
#if __UAPI_DEF_SOCKADDR_IN
|
2012-10-13 10:46:48 +01:00
|
|
|
#define __SOCK_SIZE__ 16 /* sizeof(struct sockaddr) */
|
|
|
|
struct sockaddr_in {
|
|
|
|
__kernel_sa_family_t sin_family; /* Address family */
|
|
|
|
__be16 sin_port; /* Port number */
|
|
|
|
struct in_addr sin_addr; /* Internet address */
|
|
|
|
|
|
|
|
/* Pad to size of `struct sockaddr'. */
|
|
|
|
unsigned char __pad[__SOCK_SIZE__ - sizeof(short int) -
|
|
|
|
sizeof(unsigned short int) - sizeof(struct in_addr)];
|
|
|
|
};
|
|
|
|
#define sin_zero __pad /* for BSD UNIX comp. -FvK */
|
2015-06-29 14:57:48 -10:00
|
|
|
#endif
|
2012-10-13 10:46:48 +01:00
|
|
|
|
2015-06-29 14:57:48 -10:00
|
|
|
#if __UAPI_DEF_IN_CLASS
|
2012-10-13 10:46:48 +01:00
|
|
|
/*
|
|
|
|
* Definitions of the bits in an Internet address integer.
|
|
|
|
* On subnets, host and network parts are found according
|
|
|
|
* to the subnet mask, not these masks.
|
|
|
|
*/
|
|
|
|
#define IN_CLASSA(a) ((((long int) (a)) & 0x80000000) == 0)
|
|
|
|
#define IN_CLASSA_NET 0xff000000
|
|
|
|
#define IN_CLASSA_NSHIFT 24
|
|
|
|
#define IN_CLASSA_HOST (0xffffffff & ~IN_CLASSA_NET)
|
|
|
|
#define IN_CLASSA_MAX 128
|
|
|
|
|
|
|
|
#define IN_CLASSB(a) ((((long int) (a)) & 0xc0000000) == 0x80000000)
|
|
|
|
#define IN_CLASSB_NET 0xffff0000
|
|
|
|
#define IN_CLASSB_NSHIFT 16
|
|
|
|
#define IN_CLASSB_HOST (0xffffffff & ~IN_CLASSB_NET)
|
|
|
|
#define IN_CLASSB_MAX 65536
|
|
|
|
|
|
|
|
#define IN_CLASSC(a) ((((long int) (a)) & 0xe0000000) == 0xc0000000)
|
|
|
|
#define IN_CLASSC_NET 0xffffff00
|
|
|
|
#define IN_CLASSC_NSHIFT 8
|
|
|
|
#define IN_CLASSC_HOST (0xffffffff & ~IN_CLASSC_NET)
|
|
|
|
|
|
|
|
#define IN_CLASSD(a) ((((long int) (a)) & 0xf0000000) == 0xe0000000)
|
|
|
|
#define IN_MULTICAST(a) IN_CLASSD(a)
|
2018-12-11 15:30:34 -08:00
|
|
|
#define IN_MULTICAST_NET 0xe0000000
|
2012-10-13 10:46:48 +01:00
|
|
|
|
2019-01-10 21:24:13 +01:00
|
|
|
#define IN_BADCLASS(a) (((long int) (a) ) == (long int)0xffffffff)
|
2018-12-11 15:30:34 -08:00
|
|
|
#define IN_EXPERIMENTAL(a) IN_BADCLASS((a))
|
|
|
|
|
|
|
|
#define IN_CLASSE(a) ((((long int) (a)) & 0xf0000000) == 0xf0000000)
|
|
|
|
#define IN_CLASSE_NET 0xffffffff
|
|
|
|
#define IN_CLASSE_NSHIFT 0
|
2012-10-13 10:46:48 +01:00
|
|
|
|
|
|
|
/* Address to accept any incoming messages. */
|
|
|
|
#define INADDR_ANY ((unsigned long int) 0x00000000)
|
|
|
|
|
|
|
|
/* Address to send to all hosts. */
|
|
|
|
#define INADDR_BROADCAST ((unsigned long int) 0xffffffff)
|
|
|
|
|
|
|
|
/* Address indicating an error return. */
|
|
|
|
#define INADDR_NONE ((unsigned long int) 0xffffffff)
|
|
|
|
|
icmp: don't send out ICMP messages with a source address of 0.0.0.0
When constructing ICMP response messages, the kernel will try to pick a
suitable source address for the outgoing packet. However, if no IPv4
addresses are configured on the system at all, this will fail and we end up
producing an ICMP message with a source address of 0.0.0.0. This can happen
on a box routing IPv4 traffic via v6 nexthops, for instance.
Since 0.0.0.0 is not generally routable on the internet, there's a good
chance that such ICMP messages will never make it back to the sender of the
original packet that the ICMP message was sent in response to. This, in
turn, can create connectivity and PMTUd problems for senders. Fortunately,
RFC7600 reserves a dummy address to be used as a source for ICMP
messages (192.0.0.8/32), so let's teach the kernel to substitute that
address as a last resort if the regular source address selection procedure
fails.
Below is a quick example reproducing this issue with network namespaces:
ip netns add ns0
ip l add type veth peer netns ns0
ip l set dev veth0 up
ip a add 10.0.0.1/24 dev veth0
ip a add fc00:dead:cafe:42::1/64 dev veth0
ip r add 10.1.0.0/24 via inet6 fc00:dead:cafe:42::2
ip -n ns0 l set dev veth0 up
ip -n ns0 a add fc00:dead:cafe:42::2/64 dev veth0
ip -n ns0 r add 10.0.0.0/24 via inet6 fc00:dead:cafe:42::1
ip netns exec ns0 sysctl -w net.ipv4.icmp_ratelimit=0
ip netns exec ns0 sysctl -w net.ipv4.ip_forward=1
tcpdump -tpni veth0 -c 2 icmp &
ping -w 1 10.1.0.1 > /dev/null
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on veth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
IP 10.0.0.1 > 10.1.0.1: ICMP echo request, id 29, seq 1, length 64
IP 0.0.0.0 > 10.0.0.1: ICMP net 10.1.0.1 unreachable, length 92
2 packets captured
2 packets received by filter
0 packets dropped by kernel
With this patch the above capture changes to:
IP 10.0.0.1 > 10.1.0.1: ICMP echo request, id 31127, seq 1, length 64
IP 192.0.0.8 > 10.0.0.1: ICMP net 10.1.0.1 unreachable, length 92
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Juliusz Chroboczek <jch@irif.fr>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-06-18 13:04:35 +02:00
|
|
|
/* Dummy address for src of ICMP replies if no real address is set (RFC7600). */
|
|
|
|
#define INADDR_DUMMY ((unsigned long int) 0xc0000008)
|
|
|
|
|
2012-10-13 10:46:48 +01:00
|
|
|
/* Network number for local host loopback. */
|
|
|
|
#define IN_LOOPBACKNET 127
|
|
|
|
|
|
|
|
/* Address to loopback in software to local host. */
|
|
|
|
#define INADDR_LOOPBACK 0x7f000001 /* 127.0.0.1 */
|
|
|
|
#define IN_LOOPBACK(a) ((((long int) (a)) & 0xff000000) == 0x7f000000)
|
|
|
|
|
|
|
|
/* Defines for Multicast INADDR */
|
2019-01-21 07:26:27 +01:00
|
|
|
#define INADDR_UNSPEC_GROUP 0xe0000000U /* 224.0.0.0 */
|
|
|
|
#define INADDR_ALLHOSTS_GROUP 0xe0000001U /* 224.0.0.1 */
|
|
|
|
#define INADDR_ALLRTRS_GROUP 0xe0000002U /* 224.0.0.2 */
|
|
|
|
#define INADDR_ALLSNOOPERS_GROUP 0xe000006aU /* 224.0.0.106 */
|
|
|
|
#define INADDR_MAX_LOCAL_GROUP 0xe00000ffU /* 224.0.0.255 */
|
2015-06-29 14:57:48 -10:00
|
|
|
#endif
|
2012-10-13 10:46:48 +01:00
|
|
|
|
|
|
|
/* <asm/byteorder.h> contains the htonl type stuff.. */
|
|
|
|
#include <asm/byteorder.h>
|
|
|
|
|
|
|
|
|
|
|
|
#endif /* _UAPI_LINUX_IN_H */
|