From 1a0150a93c496986297fc08304ac564213c08942 Mon Sep 17 00:00:00 2001 From: "brenohl@br.ibm.com" Date: Fri, 27 Jul 2012 08:54:52 +0000 Subject: [PATCH 01/22] qlge: Add offload features to vlan interfaces This patch fills the net_device vlan_features with the proper hardware features, thus, improving the vlan interface performance. With the patch applied, I can see around 148% improvement on a TCP_STREAM test, from 3.5 Gb/s to 8.7 Gb/s. On TCP_RR, I see a 11% improvement, from 18k to 20. The CPU utilization is almost the same on both cases, from the comparison above. Signed-off-by: Breno Leitao Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 3769f5711cc3..b53a3b60b648 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -4682,6 +4682,7 @@ static int __devinit qlge_probe(struct pci_dev *pdev, NETIF_F_HW_VLAN_TX | NETIF_F_RXCSUM; ndev->features = ndev->hw_features | NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER; + ndev->vlan_features = ndev->hw_features; if (test_bit(QL_DMA64, &qdev->flags)) ndev->features |= NETIF_F_HIGHDMA; From 4ea4bf7ebcbacee2f4736d261efb0693e87a34c9 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sun, 29 Jul 2012 01:19:55 +0000 Subject: [PATCH 02/22] ipv4: fix debug info in tnode_new It should print size of struct rt_trie_node * allocated instead of size of struct rt_trie_node. Signed-off-by: Lin Ming Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 18cbc15b20d5..2a6fdc2708c6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -473,7 +473,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct rt_trie_node) << bits); + sizeof(struct rt_trie_node *) << bits); return tn; } From 61648d91fc278fd1d500da8061d17e6920cd3500 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sun, 29 Jul 2012 02:00:03 +0000 Subject: [PATCH 03/22] ipv4: clean up put_child The first parameter struct trie *t is not used anymore. Remove it. Signed-off-by: Lin Ming Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 51 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2a6fdc2708c6..f0cdb30921c0 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -159,7 +159,6 @@ struct trie { #endif }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull); static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); @@ -490,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node * return ((struct tnode *) n)->pos == tn->pos + tn->bits; } -static inline void put_child(struct trie *t, struct tnode *tn, int i, +static inline void put_child(struct tnode *tn, int i, struct rt_trie_node *n) { tnode_put_child_reorg(tn, i, n, -1); @@ -754,8 +753,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(t, tn, 2*i, (struct rt_trie_node *) left); - put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); + put_child(tn, 2*i, (struct rt_trie_node *) left); + put_child(tn, 2*i+1, (struct rt_trie_node *) right); } } @@ -776,9 +775,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits, 1) == 0) - put_child(t, tn, 2*i, node); + put_child(tn, 2*i, node); else - put_child(t, tn, 2*i+1, node); + put_child(tn, 2*i+1, node); continue; } @@ -786,8 +785,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode = (struct tnode *) node; if (inode->bits == 1) { - put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); - put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); + put_child(tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); tnode_free_safe(inode); continue; @@ -817,22 +816,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) */ left = (struct tnode *) tnode_get_child(tn, 2*i); - put_child(t, tn, 2*i, NULL); + put_child(tn, 2*i, NULL); BUG_ON(!left); right = (struct tnode *) tnode_get_child(tn, 2*i+1); - put_child(t, tn, 2*i+1, NULL); + put_child(tn, 2*i+1, NULL); BUG_ON(!right); size = tnode_child_length(left); for (j = 0; j < size; j++) { - put_child(t, left, j, rtnl_dereference(inode->child[j])); - put_child(t, right, j, rtnl_dereference(inode->child[j + size])); + put_child(left, j, rtnl_dereference(inode->child[j])); + put_child(right, j, rtnl_dereference(inode->child[j + size])); } - put_child(t, tn, 2*i, resize(t, left)); - put_child(t, tn, 2*i+1, resize(t, right)); + put_child(tn, 2*i, resize(t, left)); + put_child(tn, 2*i+1, resize(t, right)); tnode_free_safe(inode); } @@ -877,7 +876,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(t, tn, i/2, (struct rt_trie_node *)newn); + put_child(tn, i/2, (struct rt_trie_node *)newn); } } @@ -892,21 +891,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (left == NULL) { if (right == NULL) /* Both are empty */ continue; - put_child(t, tn, i/2, right); + put_child(tn, i/2, right); continue; } if (right == NULL) { - put_child(t, tn, i/2, left); + put_child(tn, i/2, left); continue; } /* Two nonempty children */ newBinNode = (struct tnode *) tnode_get_child(tn, i/2); - put_child(t, tn, i/2, NULL); - put_child(t, newBinNode, 0, left); - put_child(t, newBinNode, 1, right); - put_child(t, tn, i/2, resize(t, newBinNode)); + put_child(tn, i/2, NULL); + put_child(newBinNode, 0, left); + put_child(newBinNode, 1, right); + put_child(tn, i/2, resize(t, newBinNode)); } tnode_free_safe(oldtnode); return tn; @@ -1125,7 +1124,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, tp, cindex, (struct rt_trie_node *)l); + put_child(tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1155,12 +1154,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) node_set_parent((struct rt_trie_node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(t, tn, missbit, (struct rt_trie_node *)l); - put_child(t, tn, 1-missbit, n); + put_child(tn, missbit, (struct rt_trie_node *)l); + put_child(tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, tp, cindex, (struct rt_trie_node *)tn); + put_child(tp, cindex, (struct rt_trie_node *)tn); } else { rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; @@ -1619,7 +1618,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) if (tp) { t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); - put_child(t, tp, cindex, NULL); + put_child(tp, cindex, NULL); trie_rebalance(t, tp); } else RCU_INIT_POINTER(t->trie, NULL); From ea4b3857861fe147137517da200d04cbbc91ad49 Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Sun, 29 Jul 2012 03:19:23 +0000 Subject: [PATCH 04/22] bnx2x: remove cast around the kmalloc in bnx2x_prev_mark_path casting the void pointer is redundant (Documentation/CodingStyle) Signed-off-by: Devendra Naga Acked-by: Eilon Greenstein Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 9aaf863b4237..dd451c3dd83d 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -9360,8 +9360,7 @@ static int __devinit bnx2x_prev_mark_path(struct bnx2x *bp) struct bnx2x_prev_path_list *tmp_list; int rc; - tmp_list = (struct bnx2x_prev_path_list *) - kmalloc(sizeof(struct bnx2x_prev_path_list), GFP_KERNEL); + tmp_list = kmalloc(sizeof(struct bnx2x_prev_path_list), GFP_KERNEL); if (!tmp_list) { BNX2X_ERR("Failed to allocate 'bnx2x_prev_path_list'\n"); return -ENOMEM; From 17a2bf798621ce8579f7563deaf4640f15931d0e Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Sun, 29 Jul 2012 03:28:47 +0000 Subject: [PATCH 05/22] seeq: use PTR_RET at init_module of driver the driver sees wether the dev_seeq pointer is having a error that can be read by using the PTR_ERR, and returns it at error case, other wise 0 at success case. the PTR_RET does the same thing, and use PTR_RET instead of redoing the code of PTR_RET Signed-off-by: Devendra Naga Acked-by: David Howells Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/seeq/seeq8005.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/seeq/seeq8005.c b/drivers/net/ethernet/seeq/seeq8005.c index 698edbbfc149..d6e50de71186 100644 --- a/drivers/net/ethernet/seeq/seeq8005.c +++ b/drivers/net/ethernet/seeq/seeq8005.c @@ -736,9 +736,7 @@ MODULE_PARM_DESC(irq, "SEEQ 8005 IRQ number"); int __init init_module(void) { dev_seeq = seeq8005_probe(-1); - if (IS_ERR(dev_seeq)) - return PTR_ERR(dev_seeq); - return 0; + return PTR_RET(dev_seeq); } void __exit cleanup_module(void) From b41a9a66f67817f8acd85bd650e012a14da39faa Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Sun, 29 Jul 2012 07:15:13 +0000 Subject: [PATCH 06/22] mISDN: Bugfix only few bytes are transfered on a connection The test for the fillempty condition was wrong in one place. Changed the variable to the right boolean type. Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- drivers/isdn/hardware/mISDN/avmfritz.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/isdn/hardware/mISDN/avmfritz.c b/drivers/isdn/hardware/mISDN/avmfritz.c index c08fc605e56b..fa6ca4733725 100644 --- a/drivers/isdn/hardware/mISDN/avmfritz.c +++ b/drivers/isdn/hardware/mISDN/avmfritz.c @@ -449,7 +449,8 @@ hdlc_fill_fifo(struct bchannel *bch) { struct fritzcard *fc = bch->hw; struct hdlc_hw *hdlc; - int count, fs, cnt = 0, idx, fillempty = 0; + int count, fs, cnt = 0, idx; + bool fillempty = false; u8 *p; u32 *ptr, val, addr; @@ -462,7 +463,7 @@ hdlc_fill_fifo(struct bchannel *bch) return; count = fs; p = bch->fill; - fillempty = 1; + fillempty = true; } else { count = bch->tx_skb->len - bch->tx_idx; if (count <= 0) @@ -477,7 +478,7 @@ hdlc_fill_fifo(struct bchannel *bch) hdlc->ctrl.sr.cmd |= HDLC_CMD_XME; } ptr = (u32 *)p; - if (fillempty) { + if (!fillempty) { pr_debug("%s.B%d: %d/%d/%d", fc->name, bch->nr, count, bch->tx_idx, bch->tx_skb->len); bch->tx_idx += count; From 8253947e2cdfb14717c9212b751b7aec9ea9ef5e Mon Sep 17 00:00:00 2001 From: Li Wei Date: Sun, 29 Jul 2012 16:01:30 +0000 Subject: [PATCH 07/22] ipv6: fix incorrect route 'expires' value passed to userspace When userspace use RTM_GETROUTE to dump route table, with an already expired route entry, we always got an 'expires' value(2147157) calculated base on INT_MAX. The reason of this problem is in the following satement: rt->dst.expires - jiffies < INT_MAX gcc promoted the type of both sides of '<' to unsigned long, thus a small negative value would be considered greater than INT_MAX. With the help of Eric Dumazet, do the out of bound checks in rtnl_put_cacheinfo(), _after_ conversion to clock_t. Signed-off-by: Li Wei Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 8 ++++++-- net/ipv6/route.c | 8 ++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc9e380f0abf..5ff949dc954f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -625,9 +625,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_id = id, }; - if (expires) - ci.rta_expires = jiffies_to_clock_t(expires); + if (expires) { + unsigned long clock; + clock = jiffies_to_clock_t(abs(expires)); + clock = min_t(unsigned long, clock, INT_MAX); + ci.rta_expires = (expires > 0) ? clock : -clock; + } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cf02cb97bbdd..8e80fd279100 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2480,12 +2480,8 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) goto nla_put_failure; - if (!(rt->rt6i_flags & RTF_EXPIRES)) - expires = 0; - else if (rt->dst.expires - jiffies < INT_MAX) - expires = rt->dst.expires - jiffies; - else - expires = INT_MAX; + + expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) goto nla_put_failure; From 8151ad576d34a5ec1f1068edf25f3b7c47f6edab Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:41 +0000 Subject: [PATCH 08/22] tg3: Request APE_LOCK_PHY before PHY access to prevent PHY access conflict with APE firmware. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 37 +++++++++++++++++++++++++++++ drivers/net/ethernet/broadcom/tg3.h | 1 + 2 files changed, 38 insertions(+) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 9a009fd6ea1b..a528f9a6a893 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -672,6 +672,12 @@ static int tg3_ape_lock(struct tg3 *tp, int locknum) else bit = 1 << tp->pci_fn; break; + case TG3_APE_LOCK_PHY0: + case TG3_APE_LOCK_PHY1: + case TG3_APE_LOCK_PHY2: + case TG3_APE_LOCK_PHY3: + bit = APE_LOCK_REQ_DRIVER; + break; default: return -EINVAL; } @@ -723,6 +729,12 @@ static void tg3_ape_unlock(struct tg3 *tp, int locknum) else bit = 1 << tp->pci_fn; break; + case TG3_APE_LOCK_PHY0: + case TG3_APE_LOCK_PHY1: + case TG3_APE_LOCK_PHY2: + case TG3_APE_LOCK_PHY3: + bit = APE_LOCK_GRANT_DRIVER; + break; default: return; } @@ -1052,6 +1064,8 @@ static int tg3_readphy(struct tg3 *tp, int reg, u32 *val) udelay(80); } + tg3_ape_lock(tp, tp->phy_ape_lock); + *val = 0x0; frame_val = ((tp->phy_addr << MI_COM_PHY_ADDR_SHIFT) & @@ -1086,6 +1100,8 @@ static int tg3_readphy(struct tg3 *tp, int reg, u32 *val) udelay(80); } + tg3_ape_unlock(tp, tp->phy_ape_lock); + return ret; } @@ -1105,6 +1121,8 @@ static int tg3_writephy(struct tg3 *tp, int reg, u32 val) udelay(80); } + tg3_ape_lock(tp, tp->phy_ape_lock); + frame_val = ((tp->phy_addr << MI_COM_PHY_ADDR_SHIFT) & MI_COM_PHY_ADDR_MASK); frame_val |= ((reg << MI_COM_REG_ADDR_SHIFT) & @@ -1135,6 +1153,8 @@ static int tg3_writephy(struct tg3 *tp, int reg, u32 val) udelay(80); } + tg3_ape_unlock(tp, tp->phy_ape_lock); + return ret; } @@ -13648,6 +13668,23 @@ static int __devinit tg3_phy_probe(struct tg3 *tp) tg3_flag_set(tp, PAUSE_AUTONEG); tp->link_config.flowctrl = FLOW_CTRL_TX | FLOW_CTRL_RX; + if (tg3_flag(tp, ENABLE_APE)) { + switch (tp->pci_fn) { + case 0: + tp->phy_ape_lock = TG3_APE_LOCK_PHY0; + break; + case 1: + tp->phy_ape_lock = TG3_APE_LOCK_PHY1; + break; + case 2: + tp->phy_ape_lock = TG3_APE_LOCK_PHY2; + break; + case 3: + tp->phy_ape_lock = TG3_APE_LOCK_PHY3; + break; + } + } + if (tg3_flag(tp, USE_PHYLIB)) return tg3_phy_init(tp); diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index a1b75cd67b9d..6fb45a500032 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -3107,6 +3107,7 @@ struct tg3 { int old_link; u8 phy_addr; + u8 phy_ape_lock; /* PHY info */ u32 phy_id; From 10ce95d6ef36c65df7dcd3b8fcf86913f8b298bd Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:42 +0000 Subject: [PATCH 09/22] tg3: Fix Read DMA workaround for 5719 A0. The workaround was mis-applied to all 5719 and 5720 chips. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index a528f9a6a893..f0434dfbda9c 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -9086,8 +9086,7 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_57780 || tg3_flag(tp, 57765_PLUS)) { val = tr32(TG3_RDMA_RSRVCTRL_REG); - if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719 || - GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5720) { + if (tp->pci_chip_rev_id == CHIPREV_ID_5719_A0) { val &= ~(TG3_RDMA_RSRVCTRL_TXMRGN_MASK | TG3_RDMA_RSRVCTRL_FIFO_LWM_MASK | TG3_RDMA_RSRVCTRL_FIFO_HWM_MASK); From 091f0ea30074bc43f9250961b3247af713024bc6 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:43 +0000 Subject: [PATCH 10/22] tg3: Add New 5719 Read DMA workaround After Power-on-reset, the 5719's TX DMA length registers may contain uninitialized values and cause TX DMA to stall. Check for invalid values and set a register bit to flush the TX channels. The bit needs to be turned off after the DMA channels have been flushed. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 23 +++++++++++++++++++++++ drivers/net/ethernet/broadcom/tg3.h | 7 ++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index f0434dfbda9c..50045ed1e8c0 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -9276,6 +9276,19 @@ static int tg3_reset_hw(struct tg3 *tp, int reset_phy) tw32_f(RDMAC_MODE, rdmac_mode); udelay(40); + if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5719) { + for (i = 0; i < TG3_NUM_RDMA_CHANNELS; i++) { + if (tr32(TG3_RDMA_LENGTH + (i << 2)) > TG3_MAX_MTU(tp)) + break; + } + if (i < TG3_NUM_RDMA_CHANNELS) { + val = tr32(TG3_LSO_RD_DMA_CRPTEN_CTRL); + val |= TG3_LSO_RD_DMA_TX_LENGTH_WA; + tw32(TG3_LSO_RD_DMA_CRPTEN_CTRL, val); + tg3_flag_set(tp, 5719_RDMA_BUG); + } + } + tw32(RCVDCC_MODE, RCVDCC_MODE_ENABLE | RCVDCC_MODE_ATTN_ENABLE); if (!tg3_flag(tp, 5705_PLUS)) tw32(MBFREE_MODE, MBFREE_MODE_ENABLE); @@ -9635,6 +9648,16 @@ static void tg3_periodic_fetch_stats(struct tg3 *tp) TG3_STAT_ADD32(&sp->tx_ucast_packets, MAC_TX_STATS_UCAST); TG3_STAT_ADD32(&sp->tx_mcast_packets, MAC_TX_STATS_MCAST); TG3_STAT_ADD32(&sp->tx_bcast_packets, MAC_TX_STATS_BCAST); + if (unlikely(tg3_flag(tp, 5719_RDMA_BUG) && + (sp->tx_ucast_packets.low + sp->tx_mcast_packets.low + + sp->tx_bcast_packets.low) > TG3_NUM_RDMA_CHANNELS)) { + u32 val; + + val = tr32(TG3_LSO_RD_DMA_CRPTEN_CTRL); + val &= ~TG3_LSO_RD_DMA_TX_LENGTH_WA; + tw32(TG3_LSO_RD_DMA_CRPTEN_CTRL, val); + tg3_flag_clear(tp, 5719_RDMA_BUG); + } TG3_STAT_ADD32(&sp->rx_octets, MAC_RX_STATS_OCTETS); TG3_STAT_ADD32(&sp->rx_fragments, MAC_RX_STATS_FRAGMENTS); diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h index 6fb45a500032..6d52cb286826 100644 --- a/drivers/net/ethernet/broadcom/tg3.h +++ b/drivers/net/ethernet/broadcom/tg3.h @@ -1376,7 +1376,11 @@ #define TG3_LSO_RD_DMA_CRPTEN_CTRL 0x00004910 #define TG3_LSO_RD_DMA_CRPTEN_CTRL_BLEN_BD_4K 0x00030000 #define TG3_LSO_RD_DMA_CRPTEN_CTRL_BLEN_LSO_4K 0x000c0000 -/* 0x4914 --> 0x4c00 unused */ +#define TG3_LSO_RD_DMA_TX_LENGTH_WA 0x02000000 +/* 0x4914 --> 0x4be0 unused */ + +#define TG3_NUM_RDMA_CHANNELS 4 +#define TG3_RDMA_LENGTH 0x00004be0 /* Write DMA control registers */ #define WDMAC_MODE 0x00004c00 @@ -2959,6 +2963,7 @@ enum TG3_FLAGS { TG3_FLAG_L1PLLPD_EN, TG3_FLAG_APE_HAS_NCSI, TG3_FLAG_4K_FIFO_LIMIT, + TG3_FLAG_5719_RDMA_BUG, TG3_FLAG_RESET_TASK_PENDING, TG3_FLAG_5705_PLUS, TG3_FLAG_IS_5788, From 0f566b208b41918053b2e67399673aaec02dde5d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:44 +0000 Subject: [PATCH 11/22] tg3: Fix race condition in tg3_get_stats64() Spinlock should be taken before checking for tp->hw_stats. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 50045ed1e8c0..f03614be9fd5 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -12524,10 +12524,12 @@ static struct rtnl_link_stats64 *tg3_get_stats64(struct net_device *dev, { struct tg3 *tp = netdev_priv(dev); - if (!tp->hw_stats) - return &tp->net_stats_prev; - spin_lock_bh(&tp->lock); + if (!tp->hw_stats) { + spin_unlock_bh(&tp->lock); + return &tp->net_stats_prev; + } + tg3_get_nstats(tp, stats); spin_unlock_bh(&tp->lock); From cac83e53917ebc058066eb98023c11fdaa2262dc Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 29 Jul 2012 19:15:45 +0000 Subject: [PATCH 12/22] tg3: Update version to 3.124 Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index f03614be9fd5..bf906c51d82a 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -92,7 +92,7 @@ static inline void _tg3_flag_clear(enum TG3_FLAGS flag, unsigned long *bits) #define DRV_MODULE_NAME "tg3" #define TG3_MAJ_NUM 3 -#define TG3_MIN_NUM 123 +#define TG3_MIN_NUM 124 #define DRV_MODULE_VERSION \ __stringify(TG3_MAJ_NUM) "." __stringify(TG3_MIN_NUM) #define DRV_MODULE_RELDATE "March 21, 2012" From a117dacde0288f3ec60b6e5bcedae8fa37ee0dfc Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 29 Jul 2012 19:45:14 +0000 Subject: [PATCH 13/22] net/tun: fix ioctl() based info leaks The tun module leaks up to 36 bytes of memory by not fully initializing a structure located on the stack that gets copied to user memory by the TUNGETIFF and SIOCGIFHWADDR ioctl()s. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- drivers/net/tun.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index c62163e272cd..f55c46222613 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1379,9 +1379,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, int vnet_hdr_sz; int ret; - if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) + if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; + } else + memset(&ifr, 0, sizeof(ifr)); if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". From 8bbb181308bc348e02bfdbebdedd4e4ec9d452ce Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 30 Jul 2012 14:52:48 -0700 Subject: [PATCH 14/22] tun: Fix formatting. Signed-off-by: David S. Miller --- drivers/net/tun.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f55c46222613..926d4db5cb38 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1382,9 +1382,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) { if (copy_from_user(&ifr, argp, ifreq_len)) return -EFAULT; - } else + } else { memset(&ifr, 0, sizeof(ifr)); - + } if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on From cca32e4bf999a34ac08d959f351f2b30bcd02460 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Jul 2012 21:06:13 +0000 Subject: [PATCH 15/22] net: TCP early demux cleanup early_demux() handlers should be called in RCU context, and as we use skb_dst_set_noref(skb, dst), caller must not exit from RCU context before dst use (skb_dst(skb)) or release (skb_drop(dst)) Therefore, rcu_read_lock()/rcu_read_unlock() pairs around ->early_demux() are confusing and not needed : Protocol handlers are already in an RCU read lock section. (__netif_receive_skb() does the rcu_read_lock() ) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 2 -- net/ipv6/ip6_input.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 981ff1eef28c..f1395a6fb35f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -325,14 +325,12 @@ static int ip_rcv_finish(struct sk_buff *skb) const struct net_protocol *ipprot; int protocol = iph->protocol; - rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->early_demux) { ipprot->early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } - rcu_read_unlock(); } /* diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 47975e363fcd..a52d864d562b 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -52,11 +52,9 @@ int ip6_rcv_finish(struct sk_buff *skb) if (sysctl_ip_early_demux && !skb_dst(skb)) { const struct inet6_protocol *ipprot; - rcu_read_lock(); ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); if (ipprot && ipprot->early_demux) ipprot->early_demux(skb); - rcu_read_unlock(); } if (!skb_dst(skb)) ip6_route_input(skb); From 404e0a8b6a55d5e1cd138c6deb1bca9abdf75d8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 29 Jul 2012 23:20:37 +0000 Subject: [PATCH 16/22] net: ipv4: fix RCU races on dst refcounts commit c6cffba4ffa2 (ipv4: Fix input route performance regression.) added various fatal races with dst refcounts. crashes happen on tcp workloads if routes are added/deleted at the same time. The dst_free() calls from free_fib_info_rcu() are clearly racy. We need instead regular dst refcounting (dst_release()) and make sure dst_release() is aware of RCU grace periods : Add DST_RCU_FREE flag so that dst_release() respects an RCU grace period before dst destruction for cached dst Introduce a new inet_sk_rx_dst_set() helper, using atomic_inc_not_zero() to make sure we dont increase a zero refcount (On a dst currently waiting an rcu grace period before destruction) rt_cache_route() must take a reference on the new cached route, and release it if was not able to install it. With this patch, my machines survive various benchmarks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 7 +------ include/net/inet_sock.h | 13 +++++++++++++ net/core/dst.c | 26 +++++++++++++++++++++----- net/decnet/dn_route.c | 6 ++++++ net/ipv4/fib_semantics.c | 4 ++-- net/ipv4/route.c | 16 ++++------------ net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_ipv4.c | 12 ++++++------ net/ipv4/tcp_minisocks.c | 3 +-- 9 files changed, 55 insertions(+), 35 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index baf597890064..31a9fd39edb6 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,6 +61,7 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 +#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -382,12 +383,6 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 613cfa401672..e3fd34c83ac9 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -249,4 +249,17 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } +static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + if (atomic_inc_not_zero(&dst->__refcnt)) { + if (!(dst->flags & DST_RCU_FREE)) + dst->flags |= DST_RCU_FREE; + + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + } +} + #endif /* _INET_SOCK_H */ diff --git a/net/core/dst.c b/net/core/dst.c index 069d51d29414..d9e33ebe170f 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -258,6 +258,15 @@ again: } EXPORT_SYMBOL(dst_destroy); +static void dst_rcu_destroy(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); +} + void dst_release(struct dst_entry *dst) { if (dst) { @@ -265,10 +274,14 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); + if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) { + if (dst->flags & DST_RCU_FREE) { + call_rcu_bh(&dst->rcu_head, dst_rcu_destroy); + } else { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } } } } @@ -320,11 +333,14 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic); */ void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { + bool hold; + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - if (unlikely(dst->flags & DST_NOCACHE)) { + hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE; + if (unlikely(hold)) { dst_hold(dst); skb_dst_set(skb, dst); } else { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 85a3604c87c8..26719779ad8e 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -184,6 +184,12 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dnrt_free(struct dn_route *rt) { call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da0cc2e6b250..e55171f184f9 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -172,9 +172,9 @@ static void free_fib_info_rcu(struct rcu_head *head) if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); if (nexthop_nh->nh_rth_output) - dst_free(&nexthop_nh->nh_rth_output->dst); + dst_release(&nexthop_nh->nh_rth_output->dst); if (nexthop_nh->nh_rth_input) - dst_free(&nexthop_nh->nh_rth_input->dst); + dst_release(&nexthop_nh->nh_rth_input->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fc1a81ca79a7..d6eabcfe8a90 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1199,11 +1199,6 @@ restart: fnhe->fnhe_stamp = jiffies; } -static inline void rt_free(struct rtable *rt) -{ - call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); -} - static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p = &nh->nh_rth_output; @@ -1213,17 +1208,14 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) orig = *p; + rt->dst.flags |= DST_RCU_FREE; + dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) - rt_free(orig); + dst_release(&orig->dst); } else { - /* Routes we intend to cache in the FIB nexthop have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; + dst_release(&rt->dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a356e1fecf9a..9be30b039ae3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5604,8 +5604,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); if (skb != NULL) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + inet_sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2fbd9921253f..7f91e5ac8277 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) #endif if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); - if (sk->sk_rx_dst) { - struct dst_entry *dst = sk->sk_rx_dst; + if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, 0) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) { - sk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - } + if (unlikely(sk->sk_rx_dst == NULL)) + inet_sk_rx_dst_set(sk, skb); + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3f1cc2028edd..232a90c3ec86 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - newsk->sk_rx_dst = dst_clone(skb_dst(skb)); - inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif; + inet_sk_rx_dst_set(newsk, skb); /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to From 0c7462a2351b4cc502f326aad7fedd04909928be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jul 2012 07:14:29 +0000 Subject: [PATCH 17/22] ipv4: remove rt_cache_rebuild_count After IP route cache removal, rt_cache_rebuild_count is no longer used. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 6 ------ include/net/netns/ipv4.h | 2 -- net/ipv4/sysctl_net_ipv4.c | 11 ----------- 3 files changed, 19 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 406a5226220d..ca447b35b833 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -48,12 +48,6 @@ min_adv_mss - INTEGER The advertised MSS depends on the first hop route MTU, but will never be lower than this setting. -rt_cache_rebuild_count - INTEGER - The per net-namespace route cache emergency rebuild threshold. - Any net-namespace having its route cache rebuilt due to - a hash bucket chain being too long more than this many times - will have its route caching disabled - IP Fragmentation: ipfrag_high_thresh - INTEGER diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 0ffb8e31f3cd..1474dd65c66f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -61,8 +61,6 @@ struct netns_ipv4 { int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; int sysctl_icmp_errors_use_inbound_ifaddr; - int sysctl_rt_cache_rebuild_count; - int current_rt_cache_rebuild_count; unsigned int sysctl_ping_group_range[2]; long sysctl_tcp_mem[3]; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5840c3255721..4b6487a68279 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -783,13 +783,6 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "rt_cache_rebuild_count", - .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "ping_group_range", .data = &init_net.ipv4.sysctl_ping_group_range, @@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table[5].data = &net->ipv4.sysctl_icmp_ratemask; table[6].data = - &net->ipv4.sysctl_rt_cache_rebuild_count; - table[7].data = &net->ipv4.sysctl_ping_group_range; } @@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_ping_group_range[0] = 1; net->ipv4.sysctl_ping_group_range[1] = 0; - net->ipv4.sysctl_rt_cache_rebuild_count = 4; - tcp_init_mem(net); net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); From 5a0d513b622ee41e117fc37e26e27e8ef42e8dae Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 30 Jul 2012 08:55:49 +0000 Subject: [PATCH 18/22] bridge: make port attributes const Simple table that can be marked const. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_sysfs_if.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 6229b62749e8..13b36bdc76a7 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -27,7 +27,7 @@ struct brport_attribute { }; #define BRPORT_ATTR(_name,_mode,_show,_store) \ -struct brport_attribute brport_attr_##_name = { \ +const struct brport_attribute brport_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = _mode }, \ .show = _show, \ @@ -164,7 +164,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, store_multicast_router); #endif -static struct brport_attribute *brport_attrs[] = { +static const struct brport_attribute *brport_attrs[] = { &brport_attr_path_cost, &brport_attr_priority, &brport_attr_port_id, @@ -241,7 +241,7 @@ const struct sysfs_ops brport_sysfs_ops = { int br_sysfs_addif(struct net_bridge_port *p) { struct net_bridge *br = p->br; - struct brport_attribute **a; + const struct brport_attribute **a; int err; err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj, From 54764bb647b2e847c512acf8d443df965da35000 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Jul 2012 01:08:23 +0000 Subject: [PATCH 19/22] ipv4: Restore old dst_free() behavior. commit 404e0a8b6a55 (net: ipv4: fix RCU races on dst refcounts) tried to solve a race but added a problem at device/fib dismantle time : We really want to call dst_free() as soon as possible, even if sockets still have dst in their cache. dst_release() calls in free_fib_info_rcu() are not welcomed. Root of the problem was that now we also cache output routes (in nh_rth_output), we must use call_rcu() instead of call_rcu_bh() in rt_free(), because output route lookups are done in process context. Based on feedback and initial patch from David Miller (adding another call_rcu_bh() call in fib, but it appears it was not the right fix) I left the inet_sk_rx_dst_set() helper and added __rcu attributes to nh_rth_output and nh_rth_input to better document what is going on in this code. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 7 ++++++- include/net/inet_sock.h | 10 +++------- include/net/ip_fib.h | 4 ++-- net/core/dst.c | 26 +++++--------------------- net/decnet/dn_route.c | 6 ------ net/ipv4/fib_semantics.c | 21 +++++++++++++++++---- net/ipv4/route.c | 26 +++++++++++++++++--------- 7 files changed, 50 insertions(+), 50 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 31a9fd39edb6..baf597890064 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -61,7 +61,6 @@ struct dst_entry { #define DST_NOPEER 0x0040 #define DST_FAKE_RTABLE 0x0080 #define DST_XFRM_TUNNEL 0x0100 -#define DST_RCU_FREE 0x0200 unsigned short pending_confirm; @@ -383,6 +382,12 @@ static inline void dst_free(struct dst_entry *dst) __dst_free(dst); } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dst_confirm(struct dst_entry *dst) { dst->pending_confirm = 1; diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index e3fd34c83ac9..83b567fe1941 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -253,13 +253,9 @@ static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb { struct dst_entry *dst = skb_dst(skb); - if (atomic_inc_not_zero(&dst->__refcnt)) { - if (!(dst->flags & DST_RCU_FREE)) - dst->flags |= DST_RCU_FREE; - - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - } + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } #endif /* _INET_SOCK_H */ diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e69c3a47153d..e521a03515b1 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -81,8 +81,8 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; - struct rtable *nh_rth_output; - struct rtable *nh_rth_input; + struct rtable __rcu *nh_rth_output; + struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/core/dst.c b/net/core/dst.c index d9e33ebe170f..069d51d29414 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -258,15 +258,6 @@ again: } EXPORT_SYMBOL(dst_destroy); -static void dst_rcu_destroy(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); -} - void dst_release(struct dst_entry *dst) { if (dst) { @@ -274,14 +265,10 @@ void dst_release(struct dst_entry *dst) newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); - if (unlikely(dst->flags & (DST_NOCACHE | DST_RCU_FREE)) && !newrefcnt) { - if (dst->flags & DST_RCU_FREE) { - call_rcu_bh(&dst->rcu_head, dst_rcu_destroy); - } else { - dst = dst_destroy(dst); - if (dst) - __dst_free(dst); - } + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); } } } @@ -333,14 +320,11 @@ EXPORT_SYMBOL(__dst_destroy_metrics_generic); */ void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { - bool hold; - WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - hold = (dst->flags & (DST_NOCACHE | DST_RCU_FREE)) == DST_NOCACHE; - if (unlikely(hold)) { + if (unlikely(dst->flags & DST_NOCACHE)) { dst_hold(dst); skb_dst_set(skb, dst); } else { diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 26719779ad8e..85a3604c87c8 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -184,12 +184,6 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst) return dn_rt_hash_mask & (unsigned int)tmp; } -static inline void dst_rcu_free(struct rcu_head *head) -{ - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_free(dst); -} - static inline void dnrt_free(struct dn_route *rt) { call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e55171f184f9..625cf185c489 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -161,6 +161,21 @@ static void free_nh_exceptions(struct fib_nh *nh) kfree(hash); } +static void rt_nexthop_free(struct rtable __rcu **rtp) +{ + struct rtable *rt = rcu_dereference_protected(*rtp, 1); + + if (!rt) + return; + + /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); + * because we waited an RCU grace period before calling + * free_fib_info_rcu() + */ + + dst_free(&rt->dst); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -171,10 +186,8 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); - if (nexthop_nh->nh_rth_output) - dst_release(&nexthop_nh->nh_rth_output->dst); - if (nexthop_nh->nh_rth_input) - dst_release(&nexthop_nh->nh_rth_input->dst); + rt_nexthop_free(&nexthop_nh->nh_rth_output); + rt_nexthop_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d6eabcfe8a90..2bd107477469 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1199,23 +1199,31 @@ restart: fnhe->fnhe_stamp = jiffies; } +static inline void rt_free(struct rtable *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { - struct rtable *orig, *prev, **p = &nh->nh_rth_output; + struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output; if (rt_is_input_route(rt)) - p = &nh->nh_rth_input; + p = (struct rtable **)&nh->nh_rth_input; orig = *p; - rt->dst.flags |= DST_RCU_FREE; - dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) - dst_release(&orig->dst); + rt_free(orig); } else { - dst_release(&rt->dst); + /* Routes we intend to cache in the FIB nexthop have + * the DST_NOCACHE bit clear. However, if we are + * unsuccessful at storing this route into the cache + * we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; } } @@ -1412,7 +1420,7 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = false; if (res->fi) { if (!itag) { - rth = FIB_RES_NH(*res).nh_rth_input; + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; @@ -1574,7 +1582,7 @@ local_input: do_cache = false; if (res.fi) { if (!itag) { - rth = FIB_RES_NH(res).nh_rth_input; + rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -1742,7 +1750,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, if (fi) { fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); if (!fnhe) { - rth = FIB_RES_NH(*res).nh_rth_output; + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output); if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; From d26b3a7c4b3b26319f18bb645de93eba8f4bdcd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Jul 2012 05:45:30 +0000 Subject: [PATCH 20/22] ipv4: percpu nh_rth_output cache Input path is mostly run under RCU and doesnt touch dst refcnt But output path on forwarding or UDP workloads hits badly dst refcount, and we have lot of false sharing, for example in ipv4_mtu() when reading rt->rt_pmtu Using a percpu cache for nh_rth_output gives a nice performance increase at a small cost. 24 udpflood test on my 24 cpu machine (dummy0 output device) (each process sends 1.000.000 udp frames, 24 processes are started) before : 5.24 s after : 2.06 s For reference, time on linux-3.5 : 6.60 s Signed-off-by: Eric Dumazet Tested-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_semantics.c | 20 +++++++++++++++++++- net/ipv4/route.c | 18 +++++++++++++----- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e521a03515b1..e331746029b4 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -21,6 +21,7 @@ #include #include #include +#include struct fib_config { u8 fc_dst_len; @@ -81,7 +82,7 @@ struct fib_nh { __be32 nh_gw; __be32 nh_saddr; int nh_saddr_genid; - struct rtable __rcu *nh_rth_output; + struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket *nh_exceptions; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 625cf185c489..fe2ca02a1979 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -176,6 +176,23 @@ static void rt_nexthop_free(struct rtable __rcu **rtp) dst_free(&rt->dst); } +static void rt_nexthop_free_cpus(struct rtable __rcu * __percpu *rtp) +{ + int cpu; + + if (!rtp) + return; + + for_each_possible_cpu(cpu) { + struct rtable *rt; + + rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); + if (rt) + dst_free(&rt->dst); + } + free_percpu(rtp); +} + /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { @@ -186,7 +203,7 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); - rt_nexthop_free(&nexthop_nh->nh_rth_output); + rt_nexthop_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_nexthop_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); @@ -817,6 +834,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_nhs = nhs; change_nexthops(fi) { nexthop_nh->nh_parent = fi; + nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); } endfor_nexthops(fi) if (cfg->fc_mx) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2bd107477469..4f6276ce0af3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1206,11 +1206,15 @@ static inline void rt_free(struct rtable *rt) static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) { - struct rtable *orig, *prev, **p = (struct rtable **)&nh->nh_rth_output; + struct rtable *orig, *prev, **p; - if (rt_is_input_route(rt)) + if (rt_is_input_route(rt)) { p = (struct rtable **)&nh->nh_rth_input; - + } else { + if (!nh->nh_pcpu_rth_output) + goto nocache; + p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); + } orig = *p; prev = cmpxchg(p, orig, rt); @@ -1223,6 +1227,7 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) * unsuccessful at storing this route into the cache * we really need to set it. */ +nocache: rt->dst.flags |= DST_NOCACHE; } } @@ -1749,8 +1754,11 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = NULL; if (fi) { fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); - if (!fnhe) { - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_output); + if (!fnhe && FIB_RES_NH(*res).nh_pcpu_rth_output) { + struct rtable __rcu **prth; + + prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output); + rth = rcu_dereference(*prth); if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; From c5038a8327b980a5b279fa193163c468011de009 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 Jul 2012 15:02:02 -0700 Subject: [PATCH 21/22] ipv4: Cache routes in nexthop exception entries. Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + net/ipv4/fib_semantics.c | 53 ++++++++++---------- net/ipv4/route.c | 101 ++++++++++++++++++++++----------------- 3 files changed, 85 insertions(+), 70 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e331746029b4..926142ed8d7a 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -55,6 +55,7 @@ struct fib_nh_exception { u32 fnhe_pmtu; __be32 fnhe_gw; unsigned long fnhe_expires; + struct rtable __rcu *fnhe_rth; unsigned long fnhe_stamp; }; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index fe2ca02a1979..da80dc14cc76 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -140,28 +140,7 @@ const struct fib_prop fib_props[RTN_MAX + 1] = { }, }; -static void free_nh_exceptions(struct fib_nh *nh) -{ - struct fnhe_hash_bucket *hash = nh->nh_exceptions; - int i; - - for (i = 0; i < FNHE_HASH_SIZE; i++) { - struct fib_nh_exception *fnhe; - - fnhe = rcu_dereference_protected(hash[i].chain, 1); - while (fnhe) { - struct fib_nh_exception *next; - - next = rcu_dereference_protected(fnhe->fnhe_next, 1); - kfree(fnhe); - - fnhe = next; - } - } - kfree(hash); -} - -static void rt_nexthop_free(struct rtable __rcu **rtp) +static void rt_fibinfo_free(struct rtable __rcu **rtp) { struct rtable *rt = rcu_dereference_protected(*rtp, 1); @@ -176,7 +155,31 @@ static void rt_nexthop_free(struct rtable __rcu **rtp) dst_free(&rt->dst); } -static void rt_nexthop_free_cpus(struct rtable __rcu * __percpu *rtp) +static void free_nh_exceptions(struct fib_nh *nh) +{ + struct fnhe_hash_bucket *hash = nh->nh_exceptions; + int i; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + fnhe = rcu_dereference_protected(hash[i].chain, 1); + while (fnhe) { + struct fib_nh_exception *next; + + next = rcu_dereference_protected(fnhe->fnhe_next, 1); + + rt_fibinfo_free(&fnhe->fnhe_rth); + + kfree(fnhe); + + fnhe = next; + } + } + kfree(hash); +} + +static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) { int cpu; @@ -203,8 +206,8 @@ static void free_fib_info_rcu(struct rcu_head *head) dev_put(nexthop_nh->nh_dev); if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); - rt_nexthop_free_cpus(nexthop_nh->nh_pcpu_rth_output); - rt_nexthop_free(&nexthop_nh->nh_rth_input); + rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); + rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4f6276ce0af3..b102eeb16e34 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -587,11 +587,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, build_sk_flow_key(fl4, sk); } -static DEFINE_SEQLOCK(fnhe_seqlock); +static inline void rt_free(struct rtable *rt) +{ + call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} + +static DEFINE_SPINLOCK(fnhe_lock); static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception *fnhe, *oldest; + struct rtable *orig; oldest = rcu_dereference(hash->chain); for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; @@ -599,6 +605,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) oldest = fnhe; } + orig = rcu_dereference(oldest->fnhe_rth); + if (orig) { + RCU_INIT_POINTER(oldest->fnhe_rth, NULL); + rt_free(orig); + } return oldest; } @@ -620,7 +631,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, int depth; u32 hval = fnhe_hashfun(daddr); - write_seqlock_bh(&fnhe_seqlock); + spin_lock_bh(&fnhe_lock); hash = nh->nh_exceptions; if (!hash) { @@ -667,7 +678,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, fnhe->fnhe_stamp = jiffies; out_unlock: - write_sequnlock_bh(&fnhe_seqlock); + spin_unlock_bh(&fnhe_lock); return; } @@ -1167,41 +1178,40 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr) { - __be32 fnhe_daddr, gw; - unsigned long expires; - unsigned int seq; - u32 pmtu; + spin_lock_bh(&fnhe_lock); -restart: - seq = read_seqbegin(&fnhe_seqlock); - fnhe_daddr = fnhe->fnhe_daddr; - gw = fnhe->fnhe_gw; - pmtu = fnhe->fnhe_pmtu; - expires = fnhe->fnhe_expires; - if (read_seqretry(&fnhe_seqlock, seq)) - goto restart; + if (daddr == fnhe->fnhe_daddr) { + struct rtable *orig; - if (daddr != fnhe_daddr) - return; + if (fnhe->fnhe_pmtu) { + unsigned long expires = fnhe->fnhe_expires; + unsigned long diff = expires - jiffies; - if (pmtu) { - unsigned long diff = expires - jiffies; - - if (time_before(jiffies, expires)) { - rt->rt_pmtu = pmtu; - dst_set_expires(&rt->dst, diff); + if (time_before(jiffies, expires)) { + rt->rt_pmtu = fnhe->fnhe_pmtu; + dst_set_expires(&rt->dst, diff); + } + } + if (fnhe->fnhe_gw) { + rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_gateway = fnhe->fnhe_gw; } - } - if (gw) { - rt->rt_flags |= RTCF_REDIRECTED; - rt->rt_gateway = gw; - } - fnhe->fnhe_stamp = jiffies; -} -static inline void rt_free(struct rtable *rt) -{ - call_rcu(&rt->dst.rcu_head, dst_rcu_free); + orig = rcu_dereference(fnhe->fnhe_rth); + rcu_assign_pointer(fnhe->fnhe_rth, rt); + if (orig) + rt_free(orig); + + fnhe->fnhe_stamp = jiffies; + } else { + /* Routes we intend to cache in nexthop exception have + * the DST_NOCACHE bit clear. However, if we are + * unsuccessful at storing this route into the cache + * we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + } + spin_unlock_bh(&fnhe_lock); } static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) @@ -1249,13 +1259,13 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) rt->rt_gateway = nh->nh_gw; - if (unlikely(fnhe)) - rt_bind_exception(rt, fnhe, daddr); dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - if (!(rt->dst.flags & DST_NOCACHE)) + if (unlikely(fnhe)) + rt_bind_exception(rt, fnhe, daddr); + else if (!(rt->dst.flags & DST_NOCACHE)) rt_cache_route(nh, rt); } @@ -1753,22 +1763,23 @@ static struct rtable *__mkroute_output(const struct fib_result *res, fnhe = NULL; if (fi) { - fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); - if (!fnhe && FIB_RES_NH(*res).nh_pcpu_rth_output) { - struct rtable __rcu **prth; + struct rtable __rcu **prth; + fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); + if (fnhe) + prth = &fnhe->fnhe_rth; + else prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output); - rth = rcu_dereference(*prth); - if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); - return rth; - } + rth = rcu_dereference(*prth); + if (rt_cache_valid(rth)) { + dst_hold(&rth->dst); + return rth; } } rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), - fi && !fnhe); + fi); if (!rth) return ERR_PTR(-ENOBUFS); From caacf05e5ad1abf0a2864863da4e33024bc68ec6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 Jul 2012 15:06:50 -0700 Subject: [PATCH 22/22] ipv4: Properly purge netdev references on uncached routes. When a device is unregistered, we have to purge all of the references to it that may exist in the entire system. If a route is uncached, we currently have no way of accomplishing this. So create a global list that is scanned when a network device goes down. This mirrors the logic in net/core/dst.c's dst_ifdown(). Signed-off-by: David S. Miller --- include/net/route.h | 3 ++ net/ipv4/fib_frontend.c | 1 + net/ipv4/route.c | 68 ++++++++++++++++++++++++++++++++++++++--- net/ipv4/xfrm4_policy.c | 1 + 4 files changed, 69 insertions(+), 4 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 8c52bc6f1c90..776a27f1ab78 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -57,6 +57,8 @@ struct rtable { /* Miscellaneous cached information */ u32 rt_pmtu; + + struct list_head rt_uncached; }; static inline bool rt_is_input_route(const struct rtable *rt) @@ -107,6 +109,7 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); extern void rt_cache_flush(struct net *net, int how); +extern void rt_flush_dev(struct net_device *dev); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8732cc7920ed..c43ae3fba792 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1046,6 +1046,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2, -1); + rt_flush_dev(dev); return NOTIFY_DONE; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b102eeb16e34..c035251beb07 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -147,6 +147,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); +static void ipv4_dst_destroy(struct dst_entry *dst); static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how) @@ -170,6 +171,7 @@ static struct dst_ops ipv4_dst_ops = { .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, + .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, @@ -1175,9 +1177,11 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) return NULL; } -static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, +static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr) { + bool ret = false; + spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { @@ -1203,6 +1207,7 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, rt_free(orig); fnhe->fnhe_stamp = jiffies; + ret = true; } else { /* Routes we intend to cache in nexthop exception have * the DST_NOCACHE bit clear. However, if we are @@ -1212,11 +1217,14 @@ static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, rt->dst.flags |= DST_NOCACHE; } spin_unlock_bh(&fnhe_lock); + + return ret; } -static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) +static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) { struct rtable *orig, *prev, **p; + bool ret = true; if (rt_is_input_route(rt)) { p = (struct rtable **)&nh->nh_rth_input; @@ -1239,6 +1247,48 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) */ nocache: rt->dst.flags |= DST_NOCACHE; + ret = false; + } + + return ret; +} + +static DEFINE_SPINLOCK(rt_uncached_lock); +static LIST_HEAD(rt_uncached_list); + +static void rt_add_uncached_list(struct rtable *rt) +{ + spin_lock_bh(&rt_uncached_lock); + list_add_tail(&rt->rt_uncached, &rt_uncached_list); + spin_unlock_bh(&rt_uncached_lock); +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + + if (dst->flags & DST_NOCACHE) { + spin_lock_bh(&rt_uncached_lock); + list_del(&rt->rt_uncached); + spin_unlock_bh(&rt_uncached_lock); + } +} + +void rt_flush_dev(struct net_device *dev) +{ + if (!list_empty(&rt_uncached_list)) { + struct net *net = dev_net(dev); + struct rtable *rt; + + spin_lock_bh(&rt_uncached_lock); + list_for_each_entry(rt, &rt_uncached_list, rt_uncached) { + if (rt->dst.dev != dev) + continue; + rt->dst.dev = net->loopback_dev; + dev_hold(rt->dst.dev); + dev_put(dev); + } + spin_unlock_bh(&rt_uncached_lock); } } @@ -1254,6 +1304,8 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag) { + bool cached = false; + if (fi) { struct fib_nh *nh = &FIB_RES_NH(*res); @@ -1264,10 +1316,12 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->dst.tclassid = nh->nh_tclassid; #endif if (unlikely(fnhe)) - rt_bind_exception(rt, fnhe, daddr); + cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) - rt_cache_route(nh, rt); + cached = rt_cache_route(nh, rt); } + if (unlikely(!cached)) + rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -1334,6 +1388,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1459,6 +1514,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1625,6 +1681,7 @@ local_input: rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; @@ -1792,6 +1849,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); @@ -2071,6 +2129,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; + INIT_LIST_HEAD(&rt->rt_uncached); + dst_free(new); } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c6281847f16a..681ea2f413e2 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -92,6 +92,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_type = rt->rt_type; xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; + INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; }