2005-08-09 20:09:46 -07:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* Generic TIME_WAIT sockets functions
|
|
|
|
*
|
|
|
|
* From code originally in TCP
|
|
|
|
*/
|
|
|
|
|
2007-08-28 15:50:33 -07:00
|
|
|
#include <linux/kernel.h>
|
2005-08-09 20:09:46 -07:00
|
|
|
#include <net/inet_hashtables.h>
|
|
|
|
#include <net/inet_timewait_sock.h>
|
2005-08-09 20:45:03 -07:00
|
|
|
#include <net/ip.h>
|
2005-08-09 20:09:46 -07:00
|
|
|
|
|
|
|
/* Must be called with locally disabled BHs. */
|
2007-07-14 19:00:59 -07:00
|
|
|
static void __inet_twsk_kill(struct inet_timewait_sock *tw,
|
|
|
|
struct inet_hashinfo *hashinfo)
|
2005-08-09 20:09:46 -07:00
|
|
|
{
|
|
|
|
struct inet_bind_hashbucket *bhead;
|
|
|
|
struct inet_bind_bucket *tb;
|
|
|
|
/* Unlink from established hashes. */
|
2007-11-07 02:40:20 -08:00
|
|
|
rwlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
|
2005-08-09 20:09:46 -07:00
|
|
|
|
2007-11-07 02:40:20 -08:00
|
|
|
write_lock(lock);
|
2005-08-09 20:09:46 -07:00
|
|
|
if (hlist_unhashed(&tw->tw_node)) {
|
2007-11-07 02:40:20 -08:00
|
|
|
write_unlock(lock);
|
2005-08-09 20:09:46 -07:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
__hlist_del(&tw->tw_node);
|
|
|
|
sk_node_init(&tw->tw_node);
|
2007-11-07 02:40:20 -08:00
|
|
|
write_unlock(lock);
|
2005-08-09 20:09:46 -07:00
|
|
|
|
|
|
|
/* Disassociate with bind bucket. */
|
2008-06-16 17:12:49 -07:00
|
|
|
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
|
|
|
|
hashinfo->bhash_size)];
|
2005-08-09 20:09:46 -07:00
|
|
|
spin_lock(&bhead->lock);
|
|
|
|
tb = tw->tw_tb;
|
|
|
|
__hlist_del(&tw->tw_bind_node);
|
|
|
|
tw->tw_tb = NULL;
|
|
|
|
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
|
|
|
|
spin_unlock(&bhead->lock);
|
|
|
|
#ifdef SOCK_REFCNT_DEBUG
|
|
|
|
if (atomic_read(&tw->tw_refcnt) != 1) {
|
|
|
|
printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
|
|
|
|
tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
inet_twsk_put(tw);
|
|
|
|
}
|
|
|
|
|
2007-12-20 15:32:54 -08:00
|
|
|
void inet_twsk_put(struct inet_timewait_sock *tw)
|
|
|
|
{
|
|
|
|
if (atomic_dec_and_test(&tw->tw_refcnt)) {
|
|
|
|
struct module *owner = tw->tw_prot->owner;
|
|
|
|
twsk_destructor((struct sock *)tw);
|
|
|
|
#ifdef SOCK_REFCNT_DEBUG
|
|
|
|
printk(KERN_DEBUG "%s timewait_sock %p released\n",
|
|
|
|
tw->tw_prot->name, tw);
|
|
|
|
#endif
|
2008-04-16 02:00:28 -07:00
|
|
|
release_net(twsk_net(tw));
|
2007-12-20 15:32:54 -08:00
|
|
|
kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
|
|
|
|
module_put(owner);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet_twsk_put);
|
|
|
|
|
2005-08-09 20:09:46 -07:00
|
|
|
/*
|
|
|
|
* Enter the time wait state. This is called with locally disabled BH.
|
|
|
|
* Essentially we whip up a timewait bucket, copy the relevant info into it
|
|
|
|
* from the SK, and mess with hash chains and list linkage.
|
|
|
|
*/
|
|
|
|
void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
|
|
|
|
struct inet_hashinfo *hashinfo)
|
|
|
|
{
|
|
|
|
const struct inet_sock *inet = inet_sk(sk);
|
2005-08-09 20:10:42 -07:00
|
|
|
const struct inet_connection_sock *icsk = inet_csk(sk);
|
[INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)
(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesnt suffer from cache line ping pongs)
1) First some performance data :
--------------------------------
tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()
The most time critical code is :
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}
The sk_for_each() does use prefetch() hints but only the begining of
"struct sock" is prefetched.
As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far
away from the begining of "struct sock", it has to bring into CPU
cache cold cache line. Each iteration has to use at least 2 cache
lines.
This can be problematic if some chains are very long.
2) The goal
-----------
The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.
3) Description of the patch
---------------------------
Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse;
int skc_bound_dev_if;
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
+ unsigned int skc_hash;
struct proto *skc_prot;
};
Store in this 32 bits field the full hash, not masked by (ehash_size -
1) Using this full hash as the first comparison done in INET_MATCH
permits us immediatly skip the element without touching a second cache
line in case of a miss.
Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)
File include/net/inet_hashtables.h
64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash))
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
(inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-03 14:13:38 -07:00
|
|
|
struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
|
2007-11-07 02:40:20 -08:00
|
|
|
rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
|
2005-08-09 20:09:46 -07:00
|
|
|
struct inet_bind_hashbucket *bhead;
|
|
|
|
/* Step 1: Put TW into bind hash. Original socket stays there too.
|
|
|
|
Note, that any socket with inet->num != 0 MUST be bound in
|
|
|
|
binding cache, even if it is closed.
|
|
|
|
*/
|
2008-06-16 17:12:49 -07:00
|
|
|
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num,
|
|
|
|
hashinfo->bhash_size)];
|
2005-08-09 20:09:46 -07:00
|
|
|
spin_lock(&bhead->lock);
|
2005-08-09 20:10:42 -07:00
|
|
|
tw->tw_tb = icsk->icsk_bind_hash;
|
2008-07-25 21:43:18 -07:00
|
|
|
WARN_ON(!icsk->icsk_bind_hash);
|
2005-08-09 20:09:46 -07:00
|
|
|
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
|
|
|
|
spin_unlock(&bhead->lock);
|
|
|
|
|
2007-11-07 02:40:20 -08:00
|
|
|
write_lock(lock);
|
2005-08-09 20:09:46 -07:00
|
|
|
|
|
|
|
/* Step 2: Remove SK from established hash. */
|
|
|
|
if (__sk_del_node_init(sk))
|
2008-03-31 19:41:46 -07:00
|
|
|
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
|
2005-08-09 20:09:46 -07:00
|
|
|
|
[NET]: change layout of ehash table
ehash table layout is currently this one :
First half of this table is used by sockets not in TIME_WAIT state
Second half of it is used by sockets in TIME_WAIT state.
This is non optimal because of for a given hash or socket, the two chain heads
are located in separate cache lines.
Moreover the locks of the second half are never used.
If instead of this halving, we use two list heads in inet_ehash_bucket instead
of only one, we probably can avoid one cache miss, and reduce ram usage,
particularly if sizeof(rwlock_t) is big (various CONFIG_DEBUG_SPINLOCK,
CONFIG_DEBUG_LOCK_ALLOC settings). So we still halves the table but we keep
together related chains to speedup lookups and socket state change.
In this patch I did not try to align struct inet_ehash_bucket, but a future
patch could try to make this structure have a convenient size (a power of two
or a multiple of L1_CACHE_SIZE).
I guess rwlock will just vanish as soon as RCU is plugged into ehash :) , so
maybe we dont need to scratch our heads to align the bucket...
Note : In case struct inet_ehash_bucket is not a power of two, we could
probably change alloc_large_system_hash() (in case it use __get_free_pages())
to free the unused space. It currently allocates a big zone, but the last
quarter of it could be freed. Again, this should be a temporary 'problem'.
Patch tested on ipv4 tcp only, but should be OK for IPV6 and DCCP.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2007-02-08 14:16:46 -08:00
|
|
|
/* Step 3: Hash TW into TIMEWAIT chain. */
|
|
|
|
inet_twsk_add_node(tw, &ehead->twchain);
|
2005-08-09 20:09:46 -07:00
|
|
|
atomic_inc(&tw->tw_refcnt);
|
|
|
|
|
2007-11-07 02:40:20 -08:00
|
|
|
write_unlock(lock);
|
2005-08-09 20:09:46 -07:00
|
|
|
}
|
2005-08-09 20:09:59 -07:00
|
|
|
|
2005-08-09 20:45:03 -07:00
|
|
|
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
|
|
|
|
|
2005-08-09 20:09:59 -07:00
|
|
|
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
|
|
|
|
{
|
2005-12-13 23:25:19 -08:00
|
|
|
struct inet_timewait_sock *tw =
|
|
|
|
kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
|
2006-12-06 20:33:16 -08:00
|
|
|
GFP_ATOMIC);
|
2005-08-09 20:09:59 -07:00
|
|
|
if (tw != NULL) {
|
|
|
|
const struct inet_sock *inet = inet_sk(sk);
|
|
|
|
|
|
|
|
/* Give us an identity. */
|
|
|
|
tw->tw_daddr = inet->daddr;
|
|
|
|
tw->tw_rcv_saddr = inet->rcv_saddr;
|
|
|
|
tw->tw_bound_dev_if = sk->sk_bound_dev_if;
|
|
|
|
tw->tw_num = inet->num;
|
|
|
|
tw->tw_state = TCP_TIME_WAIT;
|
|
|
|
tw->tw_substate = state;
|
|
|
|
tw->tw_sport = inet->sport;
|
|
|
|
tw->tw_dport = inet->dport;
|
|
|
|
tw->tw_family = sk->sk_family;
|
|
|
|
tw->tw_reuse = sk->sk_reuse;
|
[INET]: speedup inet (tcp/dccp) lookups
Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)
(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesnt suffer from cache line ping pongs)
1) First some performance data :
--------------------------------
tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()
The most time critical code is :
sk_for_each(sk, node, &head->chain) {
if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
goto hit; /* You sunk my battleship! */
}
The sk_for_each() does use prefetch() hints but only the begining of
"struct sock" is prefetched.
As INET_MATCH first comparison uses inet_sk(__sk)->daddr, wich is far
away from the begining of "struct sock", it has to bring into CPU
cache cold cache line. Each iteration has to use at least 2 cache
lines.
This can be problematic if some chains are very long.
2) The goal
-----------
The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.
3) Description of the patch
---------------------------
Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.
struct sock_common {
unsigned short skc_family;
volatile unsigned char skc_state;
unsigned char skc_reuse;
int skc_bound_dev_if;
struct hlist_node skc_node;
struct hlist_node skc_bind_node;
atomic_t skc_refcnt;
+ unsigned int skc_hash;
struct proto *skc_prot;
};
Store in this 32 bits field the full hash, not masked by (ehash_size -
1) Using this full hash as the first comparison done in INET_MATCH
permits us immediatly skip the element without touching a second cache
line in case of a miss.
Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)
File include/net/inet_hashtables.h
64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash))
((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->sk_hash == (__hash)) && \
(inet_sk(__sk)->daddr == (__saddr)) && \
(inet_sk(__sk)->rcv_saddr == (__daddr)) && \
(!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-10-03 14:13:38 -07:00
|
|
|
tw->tw_hash = sk->sk_hash;
|
2005-08-09 20:09:59 -07:00
|
|
|
tw->tw_ipv6only = 0;
|
|
|
|
tw->tw_prot = sk->sk_prot_creator;
|
2008-04-16 02:00:28 -07:00
|
|
|
twsk_net_set(tw, hold_net(sock_net(sk)));
|
2005-08-09 20:09:59 -07:00
|
|
|
atomic_set(&tw->tw_refcnt, 1);
|
|
|
|
inet_twsk_dead_node_init(tw);
|
2005-10-10 21:25:23 -07:00
|
|
|
__module_get(tw->tw_prot->owner);
|
2005-08-09 20:09:59 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return tw;
|
|
|
|
}
|
2005-08-09 20:45:03 -07:00
|
|
|
|
|
|
|
EXPORT_SYMBOL_GPL(inet_twsk_alloc);
|
|
|
|
|
|
|
|
/* Returns non-zero if quota exceeded. */
|
|
|
|
static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
|
|
|
|
const int slot)
|
|
|
|
{
|
|
|
|
struct inet_timewait_sock *tw;
|
|
|
|
struct hlist_node *node;
|
|
|
|
unsigned int killed;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* NOTE: compare this to previous version where lock
|
|
|
|
* was released after detaching chain. It was racy,
|
|
|
|
* because tw buckets are scheduled in not serialized context
|
|
|
|
* in 2.3 (with netfilter), and with softnet it is common, because
|
|
|
|
* soft irqs are not sequenced.
|
|
|
|
*/
|
|
|
|
killed = 0;
|
|
|
|
ret = 0;
|
|
|
|
rescan:
|
|
|
|
inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) {
|
|
|
|
__inet_twsk_del_dead_node(tw);
|
|
|
|
spin_unlock(&twdr->death_lock);
|
|
|
|
__inet_twsk_kill(tw, twdr->hashinfo);
|
2008-07-16 20:32:25 -07:00
|
|
|
#ifdef CONFIG_NET_NS
|
|
|
|
NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
|
|
|
|
#endif
|
2005-08-09 20:45:03 -07:00
|
|
|
inet_twsk_put(tw);
|
|
|
|
killed++;
|
|
|
|
spin_lock(&twdr->death_lock);
|
|
|
|
if (killed > INET_TWDR_TWKILL_QUOTA) {
|
|
|
|
ret = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* While we dropped twdr->death_lock, another cpu may have
|
|
|
|
* killed off the next TW bucket in the list, therefore
|
|
|
|
* do a fresh re-read of the hlist head node with the
|
|
|
|
* lock reacquired. We still use the hlist traversal
|
|
|
|
* macro in order to get the prefetches.
|
|
|
|
*/
|
|
|
|
goto rescan;
|
|
|
|
}
|
|
|
|
|
|
|
|
twdr->tw_count -= killed;
|
2008-07-16 20:32:25 -07:00
|
|
|
#ifndef CONFIG_NET_NS
|
|
|
|
NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
|
|
|
|
#endif
|
2005-08-09 20:45:03 -07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void inet_twdr_hangman(unsigned long data)
|
|
|
|
{
|
|
|
|
struct inet_timewait_death_row *twdr;
|
|
|
|
int unsigned need_timer;
|
|
|
|
|
|
|
|
twdr = (struct inet_timewait_death_row *)data;
|
|
|
|
spin_lock(&twdr->death_lock);
|
|
|
|
|
|
|
|
if (twdr->tw_count == 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
need_timer = 0;
|
|
|
|
if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
|
|
|
|
twdr->thread_slots |= (1 << twdr->slot);
|
|
|
|
schedule_work(&twdr->twkill_work);
|
|
|
|
need_timer = 1;
|
|
|
|
} else {
|
|
|
|
/* We purged the entire slot, anything left? */
|
|
|
|
if (twdr->tw_count)
|
|
|
|
need_timer = 1;
|
|
|
|
}
|
|
|
|
twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
|
|
|
|
if (need_timer)
|
|
|
|
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
|
|
|
|
out:
|
|
|
|
spin_unlock(&twdr->death_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL_GPL(inet_twdr_hangman);
|
|
|
|
|
2006-11-22 14:55:48 +00:00
|
|
|
void inet_twdr_twkill_work(struct work_struct *work)
|
2005-08-09 20:45:03 -07:00
|
|
|
{
|
2006-11-22 14:55:48 +00:00
|
|
|
struct inet_timewait_death_row *twdr =
|
|
|
|
container_of(work, struct inet_timewait_death_row, twkill_work);
|
2005-08-09 20:45:03 -07:00
|
|
|
int i;
|
|
|
|
|
2007-12-11 02:12:36 -08:00
|
|
|
BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
|
|
|
|
(sizeof(twdr->thread_slots) * 8));
|
2005-08-09 20:45:03 -07:00
|
|
|
|
|
|
|
while (twdr->thread_slots) {
|
|
|
|
spin_lock_bh(&twdr->death_lock);
|
|
|
|
for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
|
|
|
|
if (!(twdr->thread_slots & (1 << i)))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
while (inet_twdr_do_twkill_work(twdr, i) != 0) {
|
|
|
|
if (need_resched()) {
|
|
|
|
spin_unlock_bh(&twdr->death_lock);
|
|
|
|
schedule();
|
|
|
|
spin_lock_bh(&twdr->death_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
twdr->thread_slots &= ~(1 << i);
|
|
|
|
}
|
|
|
|
spin_unlock_bh(&twdr->death_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
|
|
|
|
|
|
|
|
/* These are always called from BH context. See callers in
|
|
|
|
* tcp_input.c to verify this.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* This is for handling early-kills of TIME_WAIT sockets. */
|
|
|
|
void inet_twsk_deschedule(struct inet_timewait_sock *tw,
|
|
|
|
struct inet_timewait_death_row *twdr)
|
|
|
|
{
|
|
|
|
spin_lock(&twdr->death_lock);
|
|
|
|
if (inet_twsk_del_dead_node(tw)) {
|
|
|
|
inet_twsk_put(tw);
|
|
|
|
if (--twdr->tw_count == 0)
|
|
|
|
del_timer(&twdr->tw_timer);
|
|
|
|
}
|
|
|
|
spin_unlock(&twdr->death_lock);
|
|
|
|
__inet_twsk_kill(tw, twdr->hashinfo);
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(inet_twsk_deschedule);
|
|
|
|
|
|
|
|
void inet_twsk_schedule(struct inet_timewait_sock *tw,
|
|
|
|
struct inet_timewait_death_row *twdr,
|
|
|
|
const int timeo, const int timewait_len)
|
|
|
|
{
|
|
|
|
struct hlist_head *list;
|
|
|
|
int slot;
|
|
|
|
|
|
|
|
/* timeout := RTO * 3.5
|
|
|
|
*
|
|
|
|
* 3.5 = 1+2+0.5 to wait for two retransmits.
|
|
|
|
*
|
|
|
|
* RATIONALE: if FIN arrived and we entered TIME-WAIT state,
|
|
|
|
* our ACK acking that FIN can be lost. If N subsequent retransmitted
|
|
|
|
* FINs (or previous seqments) are lost (probability of such event
|
|
|
|
* is p^(N+1), where p is probability to lose single packet and
|
|
|
|
* time to detect the loss is about RTO*(2^N - 1) with exponential
|
|
|
|
* backoff). Normal timewait length is calculated so, that we
|
|
|
|
* waited at least for one retransmitted FIN (maximal RTO is 120sec).
|
|
|
|
* [ BTW Linux. following BSD, violates this requirement waiting
|
|
|
|
* only for 60sec, we should wait at least for 240 secs.
|
|
|
|
* Well, 240 consumes too much of resources 8)
|
|
|
|
* ]
|
|
|
|
* This interval is not reduced to catch old duplicate and
|
|
|
|
* responces to our wandering segments living for two MSLs.
|
|
|
|
* However, if we use PAWS to detect
|
|
|
|
* old duplicates, we can reduce the interval to bounds required
|
|
|
|
* by RTO, rather than MSL. So, if peer understands PAWS, we
|
|
|
|
* kill tw bucket after 3.5*RTO (it is important that this number
|
|
|
|
* is greater than TS tick!) and detect old duplicates with help
|
|
|
|
* of PAWS.
|
|
|
|
*/
|
|
|
|
slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
|
|
|
|
|
|
|
|
spin_lock(&twdr->death_lock);
|
|
|
|
|
|
|
|
/* Unlink it, if it was scheduled */
|
|
|
|
if (inet_twsk_del_dead_node(tw))
|
|
|
|
twdr->tw_count--;
|
|
|
|
else
|
|
|
|
atomic_inc(&tw->tw_refcnt);
|
|
|
|
|
|
|
|
if (slot >= INET_TWDR_RECYCLE_SLOTS) {
|
|
|
|
/* Schedule to slow timer */
|
|
|
|
if (timeo >= timewait_len) {
|
|
|
|
slot = INET_TWDR_TWKILL_SLOTS - 1;
|
|
|
|
} else {
|
2007-08-28 15:50:33 -07:00
|
|
|
slot = DIV_ROUND_UP(timeo, twdr->period);
|
2005-08-09 20:45:03 -07:00
|
|
|
if (slot >= INET_TWDR_TWKILL_SLOTS)
|
|
|
|
slot = INET_TWDR_TWKILL_SLOTS - 1;
|
|
|
|
}
|
|
|
|
tw->tw_ttd = jiffies + timeo;
|
|
|
|
slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
|
|
|
|
list = &twdr->cells[slot];
|
|
|
|
} else {
|
|
|
|
tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
|
|
|
|
|
|
|
|
if (twdr->twcal_hand < 0) {
|
|
|
|
twdr->twcal_hand = 0;
|
|
|
|
twdr->twcal_jiffie = jiffies;
|
|
|
|
twdr->twcal_timer.expires = twdr->twcal_jiffie +
|
|
|
|
(slot << INET_TWDR_RECYCLE_TICK);
|
|
|
|
add_timer(&twdr->twcal_timer);
|
|
|
|
} else {
|
|
|
|
if (time_after(twdr->twcal_timer.expires,
|
|
|
|
jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
|
|
|
|
mod_timer(&twdr->twcal_timer,
|
|
|
|
jiffies + (slot << INET_TWDR_RECYCLE_TICK));
|
|
|
|
slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
|
|
|
|
}
|
|
|
|
list = &twdr->twcal_row[slot];
|
|
|
|
}
|
|
|
|
|
|
|
|
hlist_add_head(&tw->tw_death_node, list);
|
|
|
|
|
|
|
|
if (twdr->tw_count++ == 0)
|
|
|
|
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
|
|
|
|
spin_unlock(&twdr->death_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL_GPL(inet_twsk_schedule);
|
|
|
|
|
|
|
|
void inet_twdr_twcal_tick(unsigned long data)
|
|
|
|
{
|
|
|
|
struct inet_timewait_death_row *twdr;
|
|
|
|
int n, slot;
|
|
|
|
unsigned long j;
|
|
|
|
unsigned long now = jiffies;
|
|
|
|
int killed = 0;
|
|
|
|
int adv = 0;
|
|
|
|
|
|
|
|
twdr = (struct inet_timewait_death_row *)data;
|
|
|
|
|
|
|
|
spin_lock(&twdr->death_lock);
|
|
|
|
if (twdr->twcal_hand < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
slot = twdr->twcal_hand;
|
|
|
|
j = twdr->twcal_jiffie;
|
|
|
|
|
|
|
|
for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
|
|
|
|
if (time_before_eq(j, now)) {
|
|
|
|
struct hlist_node *node, *safe;
|
|
|
|
struct inet_timewait_sock *tw;
|
|
|
|
|
|
|
|
inet_twsk_for_each_inmate_safe(tw, node, safe,
|
|
|
|
&twdr->twcal_row[slot]) {
|
|
|
|
__inet_twsk_del_dead_node(tw);
|
|
|
|
__inet_twsk_kill(tw, twdr->hashinfo);
|
2008-07-16 20:32:25 -07:00
|
|
|
#ifdef CONFIG_NET_NS
|
|
|
|
NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
|
|
|
|
#endif
|
2005-08-09 20:45:03 -07:00
|
|
|
inet_twsk_put(tw);
|
|
|
|
killed++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (!adv) {
|
|
|
|
adv = 1;
|
|
|
|
twdr->twcal_jiffie = j;
|
|
|
|
twdr->twcal_hand = slot;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!hlist_empty(&twdr->twcal_row[slot])) {
|
|
|
|
mod_timer(&twdr->twcal_timer, j);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
j += 1 << INET_TWDR_RECYCLE_TICK;
|
|
|
|
slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
|
|
|
|
}
|
|
|
|
twdr->twcal_hand = -1;
|
|
|
|
|
|
|
|
out:
|
|
|
|
if ((twdr->tw_count -= killed) == 0)
|
|
|
|
del_timer(&twdr->tw_timer);
|
2008-07-16 20:32:25 -07:00
|
|
|
#ifndef CONFIG_NET_NS
|
|
|
|
NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
|
|
|
|
#endif
|
2005-08-09 20:45:03 -07:00
|
|
|
spin_unlock(&twdr->death_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
|
netns : fix kernel panic in timewait socket destruction
How to reproduce ?
- create a network namespace
- use tcp protocol and get timewait socket
- exit the network namespace
- after a moment (when the timewait socket is destroyed), the kernel
panics.
# BUG: unable to handle kernel NULL pointer dereference at
0000000000000007
IP: [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
PGD 119985067 PUD 11c5c0067 PMD 0
Oops: 0000 [1] SMP
CPU 1
Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd
edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks
sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table]
Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3
RIP: 0010:[<ffffffff821e394d>] [<ffffffff821e394d>]
inet_twdr_do_twkill_work+0x6e/0xb8
RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246
RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30
RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00
RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200
R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00
R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000
FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000)
knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff8800bff9e000, task
ffff88011ff76690)
Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a
0000000000000008
0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7
ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108
Call Trace:
<IRQ> [<ffffffff821e3a3a>] ? inet_twdr_hangman+0x0/0x9e
[<ffffffff821e3a61>] ? inet_twdr_hangman+0x27/0x9e
[<ffffffff8203c7e7>] ? run_timer_softirq+0x12c/0x193
[<ffffffff820390d1>] ? __do_softirq+0x5e/0xcd
[<ffffffff8200d08c>] ? call_softirq+0x1c/0x28
[<ffffffff8200e611>] ? do_softirq+0x2c/0x68
[<ffffffff8201a055>] ? smp_apic_timer_interrupt+0x8e/0xa9
[<ffffffff8200cad6>] ? apic_timer_interrupt+0x66/0x70
<EOI> [<ffffffff82011f4c>] ? default_idle+0x27/0x3b
[<ffffffff8200abbd>] ? cpu_idle+0x5f/0x7d
Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7
65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0
48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00
RIP [<ffffffff821e394d>] inet_twdr_do_twkill_work+0x6e/0xb8
RSP <ffff88011ff7fed0>
CR2: 0000000000000007
This patch provides a function to purge all timewait sockets related
to a network namespace. The timewait sockets life cycle is not tied with
the network namespace, that means the timewait sockets stay alive while
the network namespace dies. The timewait sockets are for avoiding to
receive a duplicate packet from the network, if the network namespace is
freed, the network stack is removed, so no chance to receive any packets
from the outside world. Furthermore, having a pending destruction timer
on these sockets with a network namespace freed is not safe and will lead
to an oops if the timer callback which try to access data belonging to
the namespace like for example in:
inet_twdr_do_twkill_work
-> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
Purging the timewait sockets at the network namespace destruction will:
1) speed up memory freeing for the namespace
2) fix kernel panic on asynchronous timewait destruction
Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Denis V. Lunev <den@openvz.org>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-09-08 13:17:27 -07:00
|
|
|
|
|
|
|
void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
|
|
|
|
struct inet_timewait_death_row *twdr, int family)
|
|
|
|
{
|
|
|
|
struct inet_timewait_sock *tw;
|
|
|
|
struct sock *sk;
|
|
|
|
struct hlist_node *node;
|
|
|
|
int h;
|
|
|
|
|
|
|
|
local_bh_disable();
|
|
|
|
for (h = 0; h < (hashinfo->ehash_size); h++) {
|
|
|
|
struct inet_ehash_bucket *head =
|
|
|
|
inet_ehash_bucket(hashinfo, h);
|
|
|
|
rwlock_t *lock = inet_ehash_lockp(hashinfo, h);
|
|
|
|
restart:
|
|
|
|
write_lock(lock);
|
|
|
|
sk_for_each(sk, node, &head->twchain) {
|
|
|
|
|
|
|
|
tw = inet_twsk(sk);
|
|
|
|
if (!net_eq(twsk_net(tw), net) ||
|
|
|
|
tw->tw_family != family)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
atomic_inc(&tw->tw_refcnt);
|
|
|
|
write_unlock(lock);
|
|
|
|
inet_twsk_deschedule(tw, twdr);
|
|
|
|
inet_twsk_put(tw);
|
|
|
|
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
write_unlock(lock);
|
|
|
|
}
|
|
|
|
local_bh_enable();
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet_twsk_purge);
|