From 4c3a76abd379d9a4668b2d417baa991de9757dc2 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sun, 22 Aug 2010 19:03:26 +0000 Subject: [PATCH 01/18] bridge: netfilter: fix a memory leak nf_bridge_alloc() always reset the skb->nf_bridge, so we should always put the old one. Signed-off-by: Changli Gao Signed-off-by: Bart De Schuymer Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 2c911c0759c2..5ed00bd7009f 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -162,8 +162,8 @@ static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb) if (tmp) { memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info)); atomic_set(&tmp->use, 1); - nf_bridge_put(nf_bridge); } + nf_bridge_put(nf_bridge); nf_bridge = tmp; } return nf_bridge; From 7b589a35a1063b52f98c8b0f1b8f69afe91c3dba Mon Sep 17 00:00:00 2001 From: Yinglin Luan Date: Sun, 22 Aug 2010 21:56:19 +0000 Subject: [PATCH 02/18] netxen: fix poll implementation Function netxen_intr has pointer to nx_host_sds_ring as second parameter not pointer to netxen_adapter. Signed-off-by: Yinglin Luan Signed-off-by: David S. Miller --- drivers/net/netxen/netxen_nic_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c index cb30df106a2c..73d314592230 100644 --- a/drivers/net/netxen/netxen_nic_main.c +++ b/drivers/net/netxen/netxen_nic_main.c @@ -2131,9 +2131,16 @@ static int netxen_nic_poll(struct napi_struct *napi, int budget) #ifdef CONFIG_NET_POLL_CONTROLLER static void netxen_nic_poll_controller(struct net_device *netdev) { + int ring; + struct nx_host_sds_ring *sds_ring; struct netxen_adapter *adapter = netdev_priv(netdev); + struct netxen_recv_context *recv_ctx = &adapter->recv_ctx; + disable_irq(adapter->irq); - netxen_intr(adapter->irq, adapter); + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + netxen_intr(adapter->irq, sds_ring); + } enable_irq(adapter->irq); } #endif From bf82791ed667758a0f18a4b76be2d931d2c1b39d Mon Sep 17 00:00:00 2001 From: Yinglin Luan Date: Sun, 22 Aug 2010 21:57:56 +0000 Subject: [PATCH 03/18] qlcnic: fix poll implementation Function qlcnic_intr has pointer to qlcnic_host_sds_ring as second parameter not pointer to qlcnic_adapter. Signed-off-by: Yinglin Luan Signed-off-by: David S. Miller --- drivers/net/qlcnic/qlcnic_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c index 213e3656d953..66eea5972020 100644 --- a/drivers/net/qlcnic/qlcnic_main.c +++ b/drivers/net/qlcnic/qlcnic_main.c @@ -2188,9 +2188,16 @@ static int qlcnic_rx_poll(struct napi_struct *napi, int budget) #ifdef CONFIG_NET_POLL_CONTROLLER static void qlcnic_poll_controller(struct net_device *netdev) { + int ring; + struct qlcnic_host_sds_ring *sds_ring; struct qlcnic_adapter *adapter = netdev_priv(netdev); + struct qlcnic_recv_context *recv_ctx = &adapter->recv_ctx; + disable_irq(adapter->irq); - qlcnic_intr(adapter->irq, adapter); + for (ring = 0; ring < adapter->max_sds_rings; ring++) { + sds_ring = &recv_ctx->sds_rings[ring]; + qlcnic_intr(adapter->irq, sds_ring); + } enable_irq(adapter->irq); } #endif From aa25ab7d943a5e1e6bcc2a65ff6669144f5b5d60 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 11 Aug 2010 05:12:57 +0000 Subject: [PATCH 04/18] 3c59x: Fix deadlock between boomerang_interrupt and boomerang_start_tx If netconsole is in use, there is a possibility for deadlock in 3c59x between boomerang_interrupt and boomerang_start_xmit. Both routines take the vp->lock, and if netconsole is in use, a pr_* call from the boomerang_interrupt routine will result in the netconsole code attempting to trnasmit an skb, which can try to take the same spin lock, resulting in deadlock. The fix is pretty straightforward. This patch allocats a bit in the 3c59x private structure to indicate that its handling an interrupt. If we get into the transmit routine and that bit is set, we can be sure that we have recursed and will deadlock if we continue, so instead we just return NETDEV_TX_BUSY, so the stack requeues the skb to try again later. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- drivers/net/3c59x.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index c754d88e5ec9..c685a55fc2f4 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -633,7 +633,8 @@ struct vortex_private { open:1, medialock:1, must_free_region:1, /* Flag: if zero, Cardbus owns the I/O region */ - large_frames:1; /* accept large frames */ + large_frames:1, /* accept large frames */ + handling_irq:1; /* private in_irq indicator */ int drv_flags; u16 status_enable; u16 intr_enable; @@ -2133,6 +2134,15 @@ boomerang_start_xmit(struct sk_buff *skb, struct net_device *dev) dev->name, vp->cur_tx); } + /* + * We can't allow a recursion from our interrupt handler back into the + * tx routine, as they take the same spin lock, and that causes + * deadlock. Just return NETDEV_TX_BUSY and let the stack try again in + * a bit + */ + if (vp->handling_irq) + return NETDEV_TX_BUSY; + if (vp->cur_tx - vp->dirty_tx >= TX_RING_SIZE) { if (vortex_debug > 0) pr_warning("%s: BUG! Tx Ring full, refusing to send buffer.\n", @@ -2335,11 +2345,13 @@ boomerang_interrupt(int irq, void *dev_id) ioaddr = vp->ioaddr; + /* * It seems dopey to put the spinlock this early, but we could race against vortex_tx_timeout * and boomerang_start_xmit */ spin_lock(&vp->lock); + vp->handling_irq = 1; status = ioread16(ioaddr + EL3_STATUS); @@ -2447,6 +2459,7 @@ boomerang_interrupt(int irq, void *dev_id) pr_debug("%s: exiting interrupt, status %4.4x.\n", dev->name, status); handler_exit: + vp->handling_irq = 0; spin_unlock(&vp->lock); return IRQ_HANDLED; } From 9dc002d8d9c2af837e789b5bb88c61a8b32c1be8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 24 Aug 2010 12:21:13 -0700 Subject: [PATCH 05/18] caif-driver: add HAS_DMA dependency Fix this error on an s390 allyesconfig build: linux-2.6/drivers/net/caif/caif_spi.c:98: undefined reference to `dma_free_coherent' Cc: Sjur Braendeland Signed-off-by: Heiko Carstens Signed-off-by: David S. Miller --- drivers/net/caif/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig index 631a6242b011..75bfc3a9d95f 100644 --- a/drivers/net/caif/Kconfig +++ b/drivers/net/caif/Kconfig @@ -15,7 +15,7 @@ config CAIF_TTY config CAIF_SPI_SLAVE tristate "CAIF SPI transport driver for slave interface" - depends on CAIF + depends on CAIF && HAS_DMA default n ---help--- The CAIF Link layer SPI Protocol driver for Slave SPI interface. From ef24b16b5d67c815874ed2d0e2581db629661ba3 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Tue, 24 Aug 2010 14:46:12 -0700 Subject: [PATCH 06/18] phylib: Fix race between returning phydev and calling adjust_link It is possible that phylib will call adjust_link before returning from {,of_}phy_connect(), which may cause the following [very rare, though] oops upon reopening the device: Unable to handle kernel paging request for data at address 0x0000024c Oops: Kernel access of bad area, sig: 11 [#1] PREEMPT SMP NR_CPUS=2 LTT NESTING LEVEL : 0 P1021 RDB Modules linked in: NIP: c0345dac LR: c0345dac CTR: c0345d84 TASK = dffab6b0[30] 'events/0' THREAD: c0d24000 CPU: 0 [...] NIP [c0345dac] adjust_link+0x28/0x19c LR [c0345dac] adjust_link+0x28/0x19c Call Trace: [c0d25f00] [000045e1] 0x45e1 (unreliable) [c0d25f30] [c036c158] phy_state_machine+0x3ac/0x554 [...] Here is why. Drivers store phydev in their private structures, e.g. gianfar driver: static int init_phy(struct net_device *dev) { ... priv->phydev = of_phy_connect(...); ... } So that adjust_link could retrieve it back: static void adjust_link(struct net_device *dev) { ... struct phy_device *phydev = priv->phydev; ... } If the device has been opened before, then phydev->state is set to PHY_HALTED (or undefined if the driver didn't call phy_stop()). Now, phy_connect starts the PHY state machine before returning phydev to the driver: phy_start_machine(phydev, NULL); if (phydev->irq > 0) phy_start_interrupts(phydev); return phydev; The time between 'phy_start_machine()' and 'return phydev' is undefined. The start machine routine delays execution for 1 second, which is enough for most cases. But under heavy load, or if you're unlucky, it is quite possible that PHY state machine will execute before phy_connect() returns, and so adjust_link callback will try to dereference phydev, which is not yet ready. To fix the issue, simply initialize the PHY's state to PHY_READY during phy_attach(). This will ensure that phylib won't call adjust_link before phy_start(). Signed-off-by: Anton Vorontsov Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c0761197c07e..16ddc77313cb 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -466,6 +466,8 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, phydev->interface = interface; + phydev->state = PHY_READY; + /* Do initial configuration here, now that * we have certain key parameters * (dev_flags and interface) */ From 4169591fd7c260b2b0b4e8f4d51f63f5b15ad78a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Aug 2010 06:52:46 +0000 Subject: [PATCH 07/18] pxa168_eth: remove unneeded null check "pep->pd" isn't checked consistently in this function. For example it's dereferenced unconditionally on the next line after the end of the if condition. This function is only called from pxa168_eth_probe() and pep->pd is always non-NULL so I removed the check. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/pxa168_eth.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/pxa168_eth.c b/drivers/net/pxa168_eth.c index ecc64d750cce..857a68115a5c 100644 --- a/drivers/net/pxa168_eth.c +++ b/drivers/net/pxa168_eth.c @@ -1414,10 +1414,8 @@ static int ethernet_phy_setup(struct net_device *dev) { struct pxa168_eth_private *pep = netdev_priv(dev); - if (pep->pd != NULL) { - if (pep->pd->init) - pep->pd->init(); - } + if (pep->pd->init) + pep->pd->init(); pep->phy = phy_scan(pep, pep->pd->phy_addr & 0x1f); if (pep->phy != NULL) phy_init(pep, pep->pd->speed, pep->pd->duplex); From 945c7c73e2e81d68e3e2970afd95254e8f153fc9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Aug 2010 06:53:33 +0000 Subject: [PATCH 08/18] pxa168_eth: fix error handling in prope A couple issues here: * Some resources weren't released. * If alloc_etherdev() failed it would have caused a NULL dereference because "pep" would be null when we checked "if (pep->clk)". * Also it's better to propagate the error codes from mdiobus_register() instead of just returning -ENOMEM. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/pxa168_eth.c | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/net/pxa168_eth.c b/drivers/net/pxa168_eth.c index 857a68115a5c..302fb480374b 100644 --- a/drivers/net/pxa168_eth.c +++ b/drivers/net/pxa168_eth.c @@ -1497,7 +1497,7 @@ static int pxa168_eth_probe(struct platform_device *pdev) dev = alloc_etherdev(sizeof(struct pxa168_eth_private)); if (!dev) { err = -ENOMEM; - goto out; + goto err_clk; } platform_set_drvdata(pdev, dev); @@ -1507,12 +1507,12 @@ static int pxa168_eth_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (res == NULL) { err = -ENODEV; - goto out; + goto err_netdev; } pep->base = ioremap(res->start, res->end - res->start + 1); if (pep->base == NULL) { err = -ENOMEM; - goto out; + goto err_netdev; } res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); BUG_ON(!res); @@ -1549,7 +1549,7 @@ static int pxa168_eth_probe(struct platform_device *pdev) pep->smi_bus = mdiobus_alloc(); if (pep->smi_bus == NULL) { err = -ENOMEM; - goto out; + goto err_base; } pep->smi_bus->priv = pep; pep->smi_bus->name = "pxa168_eth smi"; @@ -1558,31 +1558,31 @@ static int pxa168_eth_probe(struct platform_device *pdev) snprintf(pep->smi_bus->id, MII_BUS_ID_SIZE, "%d", pdev->id); pep->smi_bus->parent = &pdev->dev; pep->smi_bus->phy_mask = 0xffffffff; - if (mdiobus_register(pep->smi_bus) < 0) { - err = -ENOMEM; - goto out; - } + err = mdiobus_register(pep->smi_bus); + if (err) + goto err_free_mdio; + pxa168_init_hw(pep); err = ethernet_phy_setup(dev); if (err) - goto out; + goto err_mdiobus; SET_NETDEV_DEV(dev, &pdev->dev); err = register_netdev(dev); if (err) - goto out; + goto err_mdiobus; return 0; -out: - if (pep->clk) { - clk_disable(pep->clk); - clk_put(pep->clk); - pep->clk = NULL; - } - if (pep->base) { - iounmap(pep->base); - pep->base = NULL; - } - if (dev) - free_netdev(dev); + +err_mdiobus: + mdiobus_unregister(pep->smi_bus); +err_free_mdio: + mdiobus_free(pep->smi_bus); +err_base: + iounmap(pep->base); +err_netdev: + free_netdev(dev); +err_clk: + clk_disable(clk); + clk_put(clk); return err; } From 4f2c85106883bada5167e1d42a0409e063da8895 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Aug 2010 06:54:20 +0000 Subject: [PATCH 09/18] pxa168_eth: update call to phy_mii_ioctl() The phy_mii_ioctl() function changed recently. It now takes a struct ifreq pointer directly. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/pxa168_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/pxa168_eth.c b/drivers/net/pxa168_eth.c index 302fb480374b..f324b767f685 100644 --- a/drivers/net/pxa168_eth.c +++ b/drivers/net/pxa168_eth.c @@ -1350,7 +1350,7 @@ static int pxa168_eth_do_ioctl(struct net_device *dev, struct ifreq *ifr, { struct pxa168_eth_private *pep = netdev_priv(dev); if (pep->phy != NULL) - return phy_mii_ioctl(pep->phy, if_mii(ifr), cmd); + return phy_mii_ioctl(pep->phy, ifr, cmd); return -EOPNOTSUPP; } From b2bc85631e72485b984bcd202a104591874babba Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Aug 2010 06:55:05 +0000 Subject: [PATCH 10/18] pxa168_eth: silence gcc warnings Casting "pep->tx_desc_dma" to to a struct tx_desc pointer makes gcc complain: drivers/net/pxa168_eth.c:657: warning: cast to pointer from integer of different size Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/pxa168_eth.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/pxa168_eth.c b/drivers/net/pxa168_eth.c index f324b767f685..410ea0a61371 100644 --- a/drivers/net/pxa168_eth.c +++ b/drivers/net/pxa168_eth.c @@ -654,15 +654,15 @@ static void eth_port_start(struct net_device *dev) /* Assignment of Tx CTRP of given queue */ tx_curr_desc = pep->tx_curr_desc_q; wrl(pep, ETH_C_TX_DESC_1, - (u32) ((struct tx_desc *)pep->tx_desc_dma + tx_curr_desc)); + (u32) (pep->tx_desc_dma + tx_curr_desc * sizeof(struct tx_desc))); /* Assignment of Rx CRDP of given queue */ rx_curr_desc = pep->rx_curr_desc_q; wrl(pep, ETH_C_RX_DESC_0, - (u32) ((struct rx_desc *)pep->rx_desc_dma + rx_curr_desc)); + (u32) (pep->rx_desc_dma + rx_curr_desc * sizeof(struct rx_desc))); wrl(pep, ETH_F_RX_DESC_0, - (u32) ((struct rx_desc *)pep->rx_desc_dma + rx_curr_desc)); + (u32) (pep->rx_desc_dma + rx_curr_desc * sizeof(struct rx_desc))); /* Clear all interrupts */ wrl(pep, INT_CAUSE, 0); From ad1af0fedba14f82b240a03fe20eb9b2fdbd0357 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 25 Aug 2010 02:27:49 -0700 Subject: [PATCH 11/18] tcp: Combat per-cpu skew in orphan tests. As reported by Anton Blanchard when we use percpu_counter_read_positive() to make our orphan socket limit checks, the check can be off by up to num_cpus_online() * batch (which is 32 by default) which on a 128 cpu machine can be as large as the default orphan limit itself. Fix this by doing the full expensive sum check if the optimized check triggers. Reported-by: Anton Blanchard Signed-off-by: David S. Miller Acked-by: Eric Dumazet --- include/net/tcp.h | 18 ++++++++++++++---- net/ipv4/tcp.c | 5 +---- net/ipv4/tcp_timer.c | 8 ++++---- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index df6a2eb20193..eaa9582779d0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -268,11 +268,21 @@ static inline int between(__u32 seq1, __u32 seq2, __u32 seq3) return seq3 - seq2 >= seq1 - seq2; } -static inline int tcp_too_many_orphans(struct sock *sk, int num) +static inline bool tcp_too_many_orphans(struct sock *sk, int shift) { - return (num > sysctl_tcp_max_orphans) || - (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]); + struct percpu_counter *ocp = sk->sk_prot->orphan_count; + int orphans = percpu_counter_read_positive(ocp); + + if (orphans << shift > sysctl_tcp_max_orphans) { + orphans = percpu_counter_sum_positive(ocp); + if (orphans << shift > sysctl_tcp_max_orphans) + return true; + } + + if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) + return true; + return false; } /* syncookies: remember time of last synqueue overflow */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 176e11aaea77..197b9b77fa3e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2011,11 +2011,8 @@ void tcp_close(struct sock *sk, long timeout) } } if (sk->sk_state != TCP_CLOSE) { - int orphan_count = percpu_counter_read_positive( - sk->sk_prot->orphan_count); - sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, orphan_count)) { + if (tcp_too_many_orphans(sk, 0)) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 808bb920c9f5..c35b469e851c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -66,18 +66,18 @@ static void tcp_write_err(struct sock *sk) static int tcp_out_of_resources(struct sock *sk, int do_reset) { struct tcp_sock *tp = tcp_sk(sk); - int orphans = percpu_counter_read_positive(&tcp_orphan_count); + int shift = 0; /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) - orphans <<= 1; + shift++; /* If some dubious ICMP arrived, penalize even more. */ if (sk->sk_err_soft) - orphans <<= 1; + shift++; - if (tcp_too_many_orphans(sk, orphans)) { + if (tcp_too_many_orphans(sk, shift)) { if (net_ratelimit()) printk(KERN_INFO "Out of socket memory\n"); From c5ed63d66f24fd4f7089b5a6e087b0ce7202aa8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Aug 2010 23:02:17 -0700 Subject: [PATCH 12/18] tcp: fix three tcp sysctls tuning As discovered by Anton Blanchard, current code to autotune tcp_death_row.sysctl_max_tw_buckets, sysctl_tcp_max_orphans and sysctl_max_syn_backlog makes little sense. The bigger a page is, the less tcp_max_orphans is : 4096 on a 512GB machine in Anton's case. (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)) is much bigger if spinlock debugging is on. Its wrong to select bigger limits in this case (where kernel structures are also bigger) bhash_size max is 65536, and we get this value even for small machines. A better ground is to use size of ehash table, this also makes code shorter and more obvious. Based on a patch from Anton, and another from David. Reported-and-tested-by: Anton Blanchard Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 197b9b77fa3e..e2add5ff9cb1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3209,7 +3209,7 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long nr_pages, limit; - int order, i, max_share; + int i, max_share, cnt; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3258,22 +3258,12 @@ void __init tcp_init(void) INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } - /* Try to be a bit smarter and adjust defaults depending - * on available memory. - */ - for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); - order++) - ; - if (order >= 4) { - tcp_death_row.sysctl_max_tw_buckets = 180000; - sysctl_tcp_max_orphans = 4096 << (order - 4); - sysctl_max_syn_backlog = 1024; - } else if (order < 3) { - tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); - sysctl_tcp_max_orphans >>= (3 - order); - sysctl_max_syn_backlog = 128; - } + + cnt = tcp_hashinfo.ehash_mask + 1; + + tcp_death_row.sysctl_max_tw_buckets = cnt / 2; + sysctl_tcp_max_orphans = cnt / 2; + sysctl_max_syn_backlog = max(128, cnt / 256); /* Set the pressure threshold to be a fraction of global memory that * is up to 1/2 at 256 MB, decreasing toward zero with the amount of From d84ba638e4ba3c40023ff997aa5e8d3ed002af36 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 24 Aug 2010 16:05:48 +0000 Subject: [PATCH 13/18] tcp: select(writefds) don't hang up when a peer close connection This issue come from ruby language community. Below test program hang up when only run on Linux. % uname -mrsv Linux 2.6.26-2-486 #1 Sat Dec 26 08:37:39 UTC 2009 i686 % ruby -rsocket -ve ' BasicSocket.do_not_reverse_lookup = true serv = TCPServer.open("127.0.0.1", 0) s1 = TCPSocket.open("127.0.0.1", serv.addr[1]) s2 = serv.accept s2.close s1.write("a") rescue p $! s1.write("a") rescue p $! Thread.new { s1.write("a") }.join' ruby 1.9.3dev (2010-07-06 trunk 28554) [i686-linux] # [Hang Here] FreeBSD, Solaris, Mac doesn't. because Ruby's write() method call select() internally. and tcp_poll has a bug. SUS defined 'ready for writing' of select() as following. | A descriptor shall be considered ready for writing when a call to an output | function with O_NONBLOCK clear would not block, whether or not the function | would transfer data successfully. That said, EPIPE situation is clearly one of 'ready for writing'. We don't have read-side issue because tcp_poll() already has read side shutdown care. | if (sk->sk_shutdown & RCV_SHUTDOWN) | mask |= POLLIN | POLLRDNORM | POLLRDHUP; So, Let's insert same logic in write side. - reference url http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-core/31065 http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-core/31068 Signed-off-by: KOSAKI Motohiro Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2add5ff9cb1..3fb1428e526e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -451,7 +451,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) mask |= POLLOUT | POLLWRNORM; } - } + } else + mask |= POLLOUT | POLLWRNORM; if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; From bfc960a8eec023a170a80697fe65157cd4f44f81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Aug 2010 23:44:35 +0000 Subject: [PATCH 14/18] l2tp: test for ethernet header in l2tp_eth_dev_recv() close https://bugzilla.kernel.org/show_bug.cgi?id=16529 Before calling dev_forward_skb(), we should make sure skb head contains at least an ethernet header, even if length included in upper layer said so. Use pskb_may_pull() to make sure this ethernet header is present in skb head. Reported-by: Thomas Heil Reported-by: Ian Campbell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 58c6c4cda73b..1ae697681bc7 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -132,7 +132,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, printk("\n"); } - if (data_len < ETH_HLEN) + if (!pskb_may_pull(skb, sizeof(ETH_HLEN))) goto error; secpath_reset(skb); From fe5f098055ed7f701bfb16f8f87378cf67de9a0e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 26 Aug 2010 08:27:58 +0000 Subject: [PATCH 15/18] qlge: reset the chip before freeing the buffers Qlge is freeing the buffers before stopping the card DMA, and this can cause some severe error, as a EEH event on PPC. This patch just stop the card and then free the resources. Signed-off-by: Breno Leitao Signed-off-by: Ron Mercer Signed-off-by: David S. Miller --- drivers/net/qlge/qlge_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/qlge/qlge_main.c b/drivers/net/qlge/qlge_main.c index 8d63f69b27d9..5f89e83501f4 100644 --- a/drivers/net/qlge/qlge_main.c +++ b/drivers/net/qlge/qlge_main.c @@ -3919,12 +3919,12 @@ static int ql_adapter_down(struct ql_adapter *qdev) for (i = 0; i < qdev->rss_ring_count; i++) netif_napi_del(&qdev->rx_ring[i].napi); - ql_free_rx_buffers(qdev); - status = ql_adapter_reset(qdev); if (status) netif_err(qdev, ifdown, qdev->ndev, "reset(func #%d) FAILED!\n", qdev->func); + ql_free_rx_buffers(qdev); + return status; } From d71b0e9c0028f3af910226f995e0074873e16979 Mon Sep 17 00:00:00 2001 From: Bernard Pidoux F6BVP Date: Thu, 26 Aug 2010 11:40:00 +0000 Subject: [PATCH 16/18] ax25: missplaced sock_put(sk) This patch moves a missplaced sock_put(sk) after bh_unlock_sock(sk) like in other parts of AX25 driver. Signed-off-by: Bernard Pidoux Signed-off-by: David S. Miller --- net/ax25/ax25_ds_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index 2ce79df00680..c7d81436213d 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -112,8 +112,8 @@ void ax25_ds_heartbeat_expiry(ax25_cb *ax25) if (sk) { sock_hold(sk); ax25_destroy_socket(ax25); - sock_put(sk); bh_unlock_sock(sk); + sock_put(sk); } else ax25_destroy_socket(ax25); return; From 7e368739e3b3f1d7944794c178a15f05829b56bc Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 26 Aug 2010 16:11:08 -0700 Subject: [PATCH 17/18] net/caif/cfrfml.c: use asm/unaligned.h caif does not build on ia64 starting with 2.6.32-rc1. Using asm/unaligned.h instead of linux/unaligned/le_byteshift.h fixes the issue. include/linux/unaligned/le_byteshift.h:40:50: error: redefinition of 'get_unaligned_le16' include/linux/unaligned/le_byteshift.h:45:50: error: redefinition of 'get_unaligned_le32' include/linux/unaligned/le_byteshift.h:50:50: error: redefinition of 'get_unaligned_le64' include/linux/unaligned/le_byteshift.h:55:51: error: redefinition of 'put_unaligned_le16' include/linux/unaligned/le_byteshift.h:60:51: error: redefinition of 'put_unaligned_le32' include/linux/unaligned/le_byteshift.h:65:51: error: redefinition of 'put_unaligned_le64' include/linux/unaligned/le_struct.h:31:51: note: previous definition of 'put_unaligned_le64' was here Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/caif/cfrfml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index eb1602022ac0..9a699242d104 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include From c34186ed008229e7f7e3f1de8e6acf6374995358 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 27 Aug 2010 19:31:56 -0700 Subject: [PATCH 18/18] net/ipv4: Eliminate kstrdup memory leak The string clone is only used as a temporary copy of the argument val within the while loop, and so it should be freed before leaving the function. The call to strsep, however, modifies clone, so a pointer to the front of the string is kept in saved_clone, to make it possible to free it. The sematic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r exists@ local idexpression x; expression E; identifier l; statement S; @@ *x= \(kasprintf\|kstrdup\)(...); ... if (x == NULL) S ... when != kfree(x) when != E = x if (...) { <... when != kfree(x) * goto l; ...> * return ...; } // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 0ec9bd0ae94f..850c737e08e2 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -196,10 +196,10 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; - char *clone, *name; + char *saved_clone, *clone, *name; int ret = 0; - clone = kstrdup(val, GFP_USER); + saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; @@ -226,6 +226,7 @@ int tcp_set_allowed_congestion_control(char *val) } out: spin_unlock(&tcp_cong_list_lock); + kfree(saved_clone); return ret; }