From a8a572a6b5f2a79280d6e302cb3c1cb1fbaeb3e8 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 29 Oct 2015 09:51:16 -0400 Subject: [PATCH 01/67] xfrm: dst_entries_init() per-net dst_ops Remove the dst_entries_init/destroy calls for xfrm4 and xfrm6 dst_ops templates; their dst_entries counters will never be used. Move the xfrm dst_ops initialization from the common xfrm/xfrm_policy.c to xfrm4/xfrm4_policy.c and xfrm6/xfrm6_policy.c, and call dst_entries_init and dst_entries_destroy for each net namespace. The ipv4 and ipv6 xfrms each create a dst_ops template, and perform dst_entries_init on the templates. The template values are copied to each net namespace's xfrm.xfrm*_dst_ops. The problem there is that the dst_ops pcpuc_entries field is a percpu counter and cannot be used correctly by simply copying it to another object. The result of this is a very subtle bug; changes to the dst entries counter from one net namespace may sometimes get applied to a different net namespace dst entries counter. This is because of how the percpu counter works; it has a main count field as well as a pointer to the percpu variables. Each net namespace maintains its own main count variable, but all point to one set of percpu variables. When any net namespace happens to change one of the percpu variables to outside its small batch range, its count is moved to the net namespace's main count variable. So with multiple net namespaces operating concurrently, the dst_ops entries counter can stray from the actual value that it should be; if counts are consistently moved from one net namespace to another (which my testing showed is likely), then one net namespace winds up with a negative dst_ops count while another winds up with a continually increasing count, eventually reaching its gc_thresh limit, which causes all new traffic on the net namespace to fail with -ENOBUFS. 
Signed-off-by: Dan Streetman Signed-off-by: Dan Streetman Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 46 ++++++++++++++++++++++++++++------- net/ipv6/xfrm6_policy.c | 53 +++++++++++++++++++++++++++++------------ net/xfrm/xfrm_policy.c | 38 ----------------------------- 3 files changed, 75 insertions(+), 62 deletions(-) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index c10a9ee68433..126ff9020bad 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -236,7 +236,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xfrm_dst_ifdown(dst, dev); } -static struct dst_ops xfrm4_dst_ops = { +static struct dst_ops xfrm4_dst_ops_template = { .family = AF_INET, .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, @@ -250,7 +250,7 @@ static struct dst_ops xfrm4_dst_ops = { static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, - .dst_ops = &xfrm4_dst_ops, + .dst_ops = &xfrm4_dst_ops_template, .dst_lookup = xfrm4_dst_lookup, .get_saddr = xfrm4_get_saddr, .decode_session = _decode_session4, @@ -272,7 +272,7 @@ static struct ctl_table xfrm4_policy_table[] = { { } }; -static int __net_init xfrm4_net_init(struct net *net) +static int __net_init xfrm4_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -300,7 +300,7 @@ static int __net_init xfrm4_net_init(struct net *net) return -ENOMEM; } -static void __net_exit xfrm4_net_exit(struct net *net) +static void __net_exit xfrm4_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -312,12 +312,44 @@ static void __net_exit xfrm4_net_exit(struct net *net) if (!net_eq(net, &init_net)) kfree(table); } +#else /* CONFIG_SYSCTL */ +static int inline xfrm4_net_sysctl_init(struct net *net) +{ + return 0; +} + +static void inline xfrm4_net_sysctl_exit(struct net *net) +{ +} +#endif + +static int __net_init xfrm4_net_init(struct net *net) +{ + int ret; + + memcpy(&net->xfrm.xfrm4_dst_ops, 
&xfrm4_dst_ops_template, + sizeof(xfrm4_dst_ops_template)); + ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops); + if (ret) + return ret; + + ret = xfrm4_net_sysctl_init(net); + if (ret) + dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); + + return ret; +} + +static void __net_exit xfrm4_net_exit(struct net *net) +{ + xfrm4_net_sysctl_exit(net); + dst_entries_destroy(&net->xfrm.xfrm4_dst_ops); +} static struct pernet_operations __net_initdata xfrm4_net_ops = { .init = xfrm4_net_init, .exit = xfrm4_net_exit, }; -#endif static void __init xfrm4_policy_init(void) { @@ -326,13 +358,9 @@ static void __init xfrm4_policy_init(void) void __init xfrm4_init(void) { - dst_entries_init(&xfrm4_dst_ops); - xfrm4_state_init(); xfrm4_policy_init(); xfrm4_protocol_init(); -#ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm4_net_ops); -#endif } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index da55e0c85bb8..d51a18d607ac 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -281,7 +281,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, xfrm_dst_ifdown(dst, dev); } -static struct dst_ops xfrm6_dst_ops = { +static struct dst_ops xfrm6_dst_ops_template = { .family = AF_INET6, .gc = xfrm6_garbage_collect, .update_pmtu = xfrm6_update_pmtu, @@ -295,7 +295,7 @@ static struct dst_ops xfrm6_dst_ops = { static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, - .dst_ops = &xfrm6_dst_ops, + .dst_ops = &xfrm6_dst_ops_template, .dst_lookup = xfrm6_dst_lookup, .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, @@ -327,7 +327,7 @@ static struct ctl_table xfrm6_policy_table[] = { { } }; -static int __net_init xfrm6_net_init(struct net *net) +static int __net_init xfrm6_net_sysctl_init(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; @@ -355,7 +355,7 @@ static int __net_init xfrm6_net_init(struct net *net) return -ENOMEM; } -static void __net_exit xfrm6_net_exit(struct net *net) 
+static void __net_exit xfrm6_net_sysctl_exit(struct net *net) { struct ctl_table *table; @@ -367,24 +367,52 @@ static void __net_exit xfrm6_net_exit(struct net *net) if (!net_eq(net, &init_net)) kfree(table); } +#else /* CONFIG_SYSCTL */ +static int inline xfrm6_net_sysctl_init(struct net *net) +{ + return 0; +} + +static void inline xfrm6_net_sysctl_exit(struct net *net) +{ +} +#endif + +static int __net_init xfrm6_net_init(struct net *net) +{ + int ret; + + memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template, + sizeof(xfrm6_dst_ops_template)); + ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops); + if (ret) + return ret; + + ret = xfrm6_net_sysctl_init(net); + if (ret) + dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); + + return ret; +} + +static void __net_exit xfrm6_net_exit(struct net *net) +{ + xfrm6_net_sysctl_exit(net); + dst_entries_destroy(&net->xfrm.xfrm6_dst_ops); +} static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, }; -#endif int __init xfrm6_init(void) { int ret; - dst_entries_init(&xfrm6_dst_ops); - ret = xfrm6_policy_init(); - if (ret) { - dst_entries_destroy(&xfrm6_dst_ops); + if (ret) goto out; - } ret = xfrm6_state_init(); if (ret) goto out_policy; @@ -393,9 +421,7 @@ int __init xfrm6_init(void) if (ret) goto out_state; -#ifdef CONFIG_SYSCTL register_pernet_subsys(&xfrm6_net_ops); -#endif out: return ret; out_state: @@ -407,11 +433,8 @@ int __init xfrm6_init(void) void xfrm6_fini(void) { -#ifdef CONFIG_SYSCTL unregister_pernet_subsys(&xfrm6_net_ops); -#endif xfrm6_protocol_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); - dst_entries_destroy(&xfrm6_dst_ops); } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 94af3d065785..bacd30bda10d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2807,7 +2807,6 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) { - struct net *net; 
int err = 0; if (unlikely(afinfo == NULL)) return -EINVAL; @@ -2838,26 +2837,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) } spin_unlock(&xfrm_policy_afinfo_lock); - rtnl_lock(); - for_each_net(net) { - struct dst_ops *xfrm_dst_ops; - - switch (afinfo->family) { - case AF_INET: - xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops; - break; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops; - break; -#endif - default: - BUG(); - } - *xfrm_dst_ops = *afinfo->dst_ops; - } - rtnl_unlock(); - return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); @@ -2893,22 +2872,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); -static void __net_init xfrm_dst_ops_init(struct net *net) -{ - struct xfrm_policy_afinfo *afinfo; - - rcu_read_lock(); - afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]); - if (afinfo) - net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops; -#if IS_ENABLED(CONFIG_IPV6) - afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]); - if (afinfo) - net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops; -#endif - rcu_read_unlock(); -} - static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); @@ -3057,7 +3020,6 @@ static int __net_init xfrm_net_init(struct net *net) rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; - xfrm_dst_ops_init(net); rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; From 1dbe162d53e11665b48a1c122899ffc2c068bef4 Mon Sep 17 00:00:00 2001 From: Dongdong Liu Date: Fri, 4 Dec 2015 16:32:25 -0600 Subject: [PATCH 02/67] PCI: hisi: Fix hisi_pcie_cfg_read() 32-bit reads For 32-bit config reads (size == 4), hisi_pcie_cfg_read() returned success but never filled in the data we read. Return the register data for 32-bit config reads. Without this fix, PCI doesn't work at all because enumeration depends on 32-bit config reads. 
The driver was tested internally, but got broken in the process of upstreaming, so this fixes the breakage. Fixes: 500a1d9a43e0 ("PCI: hisi: Add HiSilicon SoC Hip05 PCIe driver") Signed-off-by: Dongdong Liu Signed-off-by: Bjorn Helgaas Reviewed-by: Zhou Wang --- drivers/pci/host/pcie-hisi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/host/pcie-hisi.c b/drivers/pci/host/pcie-hisi.c index 163671a4f798..77f7c669a1b9 100644 --- a/drivers/pci/host/pcie-hisi.c +++ b/drivers/pci/host/pcie-hisi.c @@ -61,7 +61,9 @@ static int hisi_pcie_cfg_read(struct pcie_port *pp, int where, int size, *val = *(u8 __force *) walker; else if (size == 2) *val = *(u16 __force *) walker; - else if (size != 4) + else if (size == 4) + *val = reg_val; + else return PCIBIOS_BAD_REGISTER_NUMBER; return PCIBIOS_SUCCESSFUL; From 628a2918afe42fae2f90749ad3721853fd06b262 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 3 Dec 2015 17:26:59 +0100 Subject: [PATCH 03/67] iwlwifi: separate firmware version for 7260 devices The 7260 devices aren't going to be updated for completely new firmware versions any more (only bugfixes), and haven't been since API version 17. Encode that in the data structures to avoid trying to load FW images that will never exist. 
Signed-off-by: Johannes Berg Signed-off-by: Emmanuel Grumbach --- drivers/net/wireless/iwlwifi/iwl-7000.c | 49 ++++++++++++++++++------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-7000.c b/drivers/net/wireless/iwlwifi/iwl-7000.c index bf88ec3a65fa..d9a4aee246a6 100644 --- a/drivers/net/wireless/iwlwifi/iwl-7000.c +++ b/drivers/net/wireless/iwlwifi/iwl-7000.c @@ -69,13 +69,19 @@ #include "iwl-agn-hw.h" /* Highest firmware API version supported */ -#define IWL7260_UCODE_API_MAX 19 +#define IWL7260_UCODE_API_MAX 17 +#define IWL7265_UCODE_API_MAX 19 +#define IWL7265D_UCODE_API_MAX 19 /* Oldest version we won't warn about */ #define IWL7260_UCODE_API_OK 13 +#define IWL7265_UCODE_API_OK 13 +#define IWL7265D_UCODE_API_OK 13 /* Lowest firmware API version supported */ #define IWL7260_UCODE_API_MIN 13 +#define IWL7265_UCODE_API_MIN 13 +#define IWL7265D_UCODE_API_MIN 13 /* NVM versions */ #define IWL7260_NVM_VERSION 0x0a1d @@ -149,10 +155,7 @@ static const struct iwl_ht_params iwl7000_ht_params = { .ht40_bands = BIT(IEEE80211_BAND_2GHZ) | BIT(IEEE80211_BAND_5GHZ), }; -#define IWL_DEVICE_7000 \ - .ucode_api_max = IWL7260_UCODE_API_MAX, \ - .ucode_api_ok = IWL7260_UCODE_API_OK, \ - .ucode_api_min = IWL7260_UCODE_API_MIN, \ +#define IWL_DEVICE_7000_COMMON \ .device_family = IWL_DEVICE_FAMILY_7000, \ .max_inst_size = IWL60_RTC_INST_SIZE, \ .max_data_size = IWL60_RTC_DATA_SIZE, \ @@ -163,6 +166,24 @@ static const struct iwl_ht_params iwl7000_ht_params = { .max_ht_ampdu_exponent = IEEE80211_HT_MAX_AMPDU_64K, \ .dccm_offset = IWL7000_DCCM_OFFSET +#define IWL_DEVICE_7000 \ + IWL_DEVICE_7000_COMMON, \ + .ucode_api_max = IWL7260_UCODE_API_MAX, \ + .ucode_api_ok = IWL7260_UCODE_API_OK, \ + .ucode_api_min = IWL7260_UCODE_API_MIN + +#define IWL_DEVICE_7005 \ + IWL_DEVICE_7000_COMMON, \ + .ucode_api_max = IWL7265_UCODE_API_MAX, \ + .ucode_api_ok = IWL7265_UCODE_API_OK, \ + .ucode_api_min = IWL7265_UCODE_API_MIN + +#define 
IWL_DEVICE_7005D \ + IWL_DEVICE_7000_COMMON, \ + .ucode_api_max = IWL7265D_UCODE_API_MAX, \ + .ucode_api_ok = IWL7265D_UCODE_API_OK, \ + .ucode_api_min = IWL7265D_UCODE_API_MIN + const struct iwl_cfg iwl7260_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 7260", .fw_name_pre = IWL7260_FW_PRE, @@ -266,7 +287,7 @@ static const struct iwl_ht_params iwl7265_ht_params = { const struct iwl_cfg iwl3165_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 3165", .fw_name_pre = IWL7265D_FW_PRE, - IWL_DEVICE_7000, + IWL_DEVICE_7005D, .ht_params = &iwl7000_ht_params, .nvm_ver = IWL3165_NVM_VERSION, .nvm_calib_ver = IWL3165_TX_POWER_VERSION, @@ -277,7 +298,7 @@ const struct iwl_cfg iwl3165_2ac_cfg = { const struct iwl_cfg iwl7265_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 7265", .fw_name_pre = IWL7265_FW_PRE, - IWL_DEVICE_7000, + IWL_DEVICE_7005, .ht_params = &iwl7265_ht_params, .nvm_ver = IWL7265_NVM_VERSION, .nvm_calib_ver = IWL7265_TX_POWER_VERSION, @@ -288,7 +309,7 @@ const struct iwl_cfg iwl7265_2ac_cfg = { const struct iwl_cfg iwl7265_2n_cfg = { .name = "Intel(R) Dual Band Wireless N 7265", .fw_name_pre = IWL7265_FW_PRE, - IWL_DEVICE_7000, + IWL_DEVICE_7005, .ht_params = &iwl7265_ht_params, .nvm_ver = IWL7265_NVM_VERSION, .nvm_calib_ver = IWL7265_TX_POWER_VERSION, @@ -299,7 +320,7 @@ const struct iwl_cfg iwl7265_2n_cfg = { const struct iwl_cfg iwl7265_n_cfg = { .name = "Intel(R) Wireless N 7265", .fw_name_pre = IWL7265_FW_PRE, - IWL_DEVICE_7000, + IWL_DEVICE_7005, .ht_params = &iwl7265_ht_params, .nvm_ver = IWL7265_NVM_VERSION, .nvm_calib_ver = IWL7265_TX_POWER_VERSION, @@ -310,7 +331,7 @@ const struct iwl_cfg iwl7265_n_cfg = { const struct iwl_cfg iwl7265d_2ac_cfg = { .name = "Intel(R) Dual Band Wireless AC 7265", .fw_name_pre = IWL7265D_FW_PRE, - IWL_DEVICE_7000, + IWL_DEVICE_7005D, .ht_params = &iwl7265_ht_params, .nvm_ver = IWL7265D_NVM_VERSION, .nvm_calib_ver = IWL7265_TX_POWER_VERSION, @@ -321,7 +342,7 @@ const struct iwl_cfg iwl7265d_2ac_cfg = { const 
struct iwl_cfg iwl7265d_2n_cfg = { .name = "Intel(R) Dual Band Wireless N 7265", .fw_name_pre = IWL7265D_FW_PRE, - IWL_DEVICE_7000, + IWL_DEVICE_7005D, .ht_params = &iwl7265_ht_params, .nvm_ver = IWL7265D_NVM_VERSION, .nvm_calib_ver = IWL7265_TX_POWER_VERSION, @@ -332,7 +353,7 @@ const struct iwl_cfg iwl7265d_2n_cfg = { const struct iwl_cfg iwl7265d_n_cfg = { .name = "Intel(R) Wireless N 7265", .fw_name_pre = IWL7265D_FW_PRE, - IWL_DEVICE_7000, + IWL_DEVICE_7005D, .ht_params = &iwl7265_ht_params, .nvm_ver = IWL7265D_NVM_VERSION, .nvm_calib_ver = IWL7265_TX_POWER_VERSION, @@ -342,5 +363,5 @@ const struct iwl_cfg iwl7265d_n_cfg = { MODULE_FIRMWARE(IWL7260_MODULE_FIRMWARE(IWL7260_UCODE_API_OK)); MODULE_FIRMWARE(IWL3160_MODULE_FIRMWARE(IWL7260_UCODE_API_OK)); -MODULE_FIRMWARE(IWL7265_MODULE_FIRMWARE(IWL7260_UCODE_API_OK)); -MODULE_FIRMWARE(IWL7265D_MODULE_FIRMWARE(IWL7260_UCODE_API_OK)); +MODULE_FIRMWARE(IWL7265_MODULE_FIRMWARE(IWL7265_UCODE_API_OK)); +MODULE_FIRMWARE(IWL7265D_MODULE_FIRMWARE(IWL7265D_UCODE_API_OK)); From 4585436091cd812b1165aab71bd4847ea1cb08ec Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 11 Dec 2015 09:06:25 +0100 Subject: [PATCH 04/67] iwlwifi: mvm: protect RCU dereference in iwl_mvm_get_key_sta_id Properly protect the RCU dereference in iwl_mvm_get_key_sta_id() when coming from iwl_mvm_update_tkip_key() which cannot hold the mvm->mutex by moving the call into the RCU critical section. Modify the check to use rcu_dereference_check() to permit this. 
Fixes: 9513c5e18a0d ("iwlwifi: mvm: Avoid dereferencing sta if it was already flushed") Reported-by: Laura Abbott Signed-off-by: Johannes Berg Signed-off-by: Emmanuel Grumbach --- drivers/net/wireless/iwlwifi/mvm/sta.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/mvm/sta.c b/drivers/net/wireless/iwlwifi/mvm/sta.c index 354acbde088e..2b976b110207 100644 --- a/drivers/net/wireless/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/iwlwifi/mvm/sta.c @@ -1222,8 +1222,8 @@ static u8 iwl_mvm_get_key_sta_id(struct iwl_mvm *mvm, mvmvif->ap_sta_id != IWL_MVM_STATION_COUNT) { u8 sta_id = mvmvif->ap_sta_id; - sta = rcu_dereference_protected(mvm->fw_id_to_mac_id[sta_id], - lockdep_is_held(&mvm->mutex)); + sta = rcu_dereference_check(mvm->fw_id_to_mac_id[sta_id], + lockdep_is_held(&mvm->mutex)); /* * It is possible that the 'sta' parameter is NULL, * for example when a GTK is removed - the sta_id will then @@ -1590,14 +1590,15 @@ void iwl_mvm_update_tkip_key(struct iwl_mvm *mvm, u16 *phase1key) { struct iwl_mvm_sta *mvm_sta; - u8 sta_id = iwl_mvm_get_key_sta_id(mvm, vif, sta); + u8 sta_id; bool mcast = !(keyconf->flags & IEEE80211_KEY_FLAG_PAIRWISE); - if (WARN_ON_ONCE(sta_id == IWL_MVM_STATION_COUNT)) - return; - rcu_read_lock(); + sta_id = iwl_mvm_get_key_sta_id(mvm, vif, sta); + if (WARN_ON_ONCE(sta_id == IWL_MVM_STATION_COUNT)) + goto unlock; + if (!sta) { sta = rcu_dereference(mvm->fw_id_to_mac_id[sta_id]); if (WARN_ON(IS_ERR_OR_NULL(sta))) { @@ -1609,6 +1610,8 @@ void iwl_mvm_update_tkip_key(struct iwl_mvm *mvm, mvm_sta = iwl_mvm_sta_from_mac80211(sta); iwl_mvm_send_sta_key(mvm, mvm_sta, keyconf, mcast, iv32, phase1key, CMD_ASYNC, keyconf->hw_key_idx); + + unlock: rcu_read_unlock(); } From aa47e42c60dfa31f81a3fe357451acfe1a12ca1e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 15 Dec 2015 21:29:16 +0100 Subject: [PATCH 05/67] netfilter: nf_tables: use skb->protocol instead of assuming ethernet header 
Otherwise we may end up with incorrect network and transport header for other protocols. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 7b9c053ba750..edb3502f2016 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -94,7 +94,7 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, { struct nft_pktinfo pkt; - switch (eth_hdr(skb)->h_proto) { + switch (skb->protocol) { case htons(ETH_P_IP): nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); break; From d5f79b6e4d169039903cc869e16e59ad861dd479 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 18 Dec 2015 14:32:07 +0100 Subject: [PATCH 06/67] netfilter: nft_ct: include direction when dumping NFT_CT_L3PROTOCOL key one nft userspace test case fails with 'ct l3proto original ipv4' mismatches 'ct l3proto ipv4' ... because NFTA_CT_DIRECTION attr is missing. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_ct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 8cbca3432f90..939921532764 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -366,6 +366,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; switch (priv->key) { + case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: From 1873c58d4a45bd4d7104ba1482fcd9c3bd094cd1 Mon Sep 17 00:00:00 2001 From: "Pascal Speck (Iktek)" Date: Fri, 4 Dec 2015 16:55:17 +0100 Subject: [PATCH 07/67] ethernet:ti:cpsw: fix phy identification with multiple slaves on fixed-phy When using more than one slave with ti cpsw and fixed phy the pd->phy_id will be always zero, but slave_data->phy_id must be unique. 
pd->phy_id means a "phy hardware id" whereas slave_data->phy_id means an "unique id", so we should use pd->addr which has the same unique meaning. Fixes: 1f71e8c96fc6 ("drivers: net: cpsw: Add support for fixed-link PHY") Signed-off-by: Pascal Speck Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 48b92c9de12a..e3b220de3ed4 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2047,7 +2047,7 @@ static int cpsw_probe_dt(struct cpsw_priv *priv, if (!pd) return -ENODEV; snprintf(slave_data->phy_id, sizeof(slave_data->phy_id), - PHY_ID_FMT, pd->bus->id, pd->phy_id); + PHY_ID_FMT, pd->bus->id, pd->addr); goto no_phy_slave; } parp = of_get_property(slave_node, "phy_id", &lenp); From f1eea5c15ae799a1291f0f481fa3ea09be913fa9 Mon Sep 17 00:00:00 2001 From: David Rivshin Date: Wed, 16 Dec 2015 23:02:10 -0500 Subject: [PATCH 08/67] drivers: net: cpsw: fix RMII/RGMII mode when used with fixed-link PHY Commit 1f71e8c96fc654724723ce987e0a8b2aeb81746d ("drivers: net: cpsw: Add support for fixed-link PHY") did not parse the "phy-mode" property in the case of a fixed-link PHY, leaving slave_data->phy_if with its default of PHY_INTERFACE_MODE_NA(0). This later gets passed to phy_connect() in cpsw_slave_open(), and eventually to cpsw_phy_sel() where it hits a default case that configures the MAC for MII mode. The user visible symptom is that while kernel log messages seem to indicate that the interface is set up, there is no network communication. Eventually a watchdog error occurs: NETDEV WATCHDOG: eth0 (cpsw): transmit queue 0 timed out Fixes: 1f71e8c96fc6 ("drivers: net: cpsw: Add support for fixed-link PHY") Signed-off-by: David Rivshin Signed-off-by: David S. 
Miller --- .../devicetree/bindings/net/cpsw.txt | 6 +-- drivers/net/ethernet/ti/cpsw.c | 40 ++++++++++--------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt index 9853f8e70966..28a4781ab6d7 100644 --- a/Documentation/devicetree/bindings/net/cpsw.txt +++ b/Documentation/devicetree/bindings/net/cpsw.txt @@ -40,18 +40,18 @@ Optional properties: Slave Properties: Required properties: -- phy_id : Specifies slave phy id - phy-mode : See ethernet.txt file in the same directory Optional properties: - dual_emac_res_vlan : Specifies VID to be used to segregate the ports - mac-address : See ethernet.txt file in the same directory +- phy_id : Specifies slave phy id - phy-handle : See ethernet.txt file in the same directory Slave sub-nodes: - fixed-link : See fixed-link.txt file in the same directory - Either the properties phy_id and phy-mode, - or the sub-node fixed-link can be specified + Either the property phy_id, or the sub-node + fixed-link can be specified Note: "ti,hwmods" field is used to fetch the base address and irq resources from TI, omap hwmod data base during device registration. 
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index e3b220de3ed4..bc6d20dc28a0 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2026,17 +2026,15 @@ static int cpsw_probe_dt(struct cpsw_priv *priv, for_each_child_of_node(node, slave_node) { struct cpsw_slave_data *slave_data = data->slave_data + i; const void *mac_addr = NULL; - u32 phyid; int lenp; const __be32 *parp; - struct device_node *mdio_node; - struct platform_device *mdio; /* This is no slave child node, continue */ if (strcmp(slave_node->name, "slave")) continue; priv->phy_node = of_parse_phandle(slave_node, "phy-handle", 0); + parp = of_get_property(slave_node, "phy_id", &lenp); if (of_phy_is_fixed_link(slave_node)) { struct phy_device *pd; @@ -2048,23 +2046,29 @@ static int cpsw_probe_dt(struct cpsw_priv *priv, return -ENODEV; snprintf(slave_data->phy_id, sizeof(slave_data->phy_id), PHY_ID_FMT, pd->bus->id, pd->addr); + } else if (parp) { + u32 phyid; + struct device_node *mdio_node; + struct platform_device *mdio; + + if (lenp != (sizeof(__be32) * 2)) { + dev_err(&pdev->dev, "Invalid slave[%d] phy_id property\n", i); + goto no_phy_slave; + } + mdio_node = of_find_node_by_phandle(be32_to_cpup(parp)); + phyid = be32_to_cpup(parp+1); + mdio = of_find_device_by_node(mdio_node); + of_node_put(mdio_node); + if (!mdio) { + dev_err(&pdev->dev, "Missing mdio platform device\n"); + return -EINVAL; + } + snprintf(slave_data->phy_id, sizeof(slave_data->phy_id), + PHY_ID_FMT, mdio->name, phyid); + } else { + dev_err(&pdev->dev, "No slave[%d] phy_id or fixed-link property\n", i); goto no_phy_slave; } - parp = of_get_property(slave_node, "phy_id", &lenp); - if ((parp == NULL) || (lenp != (sizeof(void *) * 2))) { - dev_err(&pdev->dev, "Missing slave[%d] phy_id property\n", i); - goto no_phy_slave; - } - mdio_node = of_find_node_by_phandle(be32_to_cpup(parp)); - phyid = be32_to_cpup(parp+1); - mdio = of_find_device_by_node(mdio_node); - 
of_node_put(mdio_node); - if (!mdio) { - dev_err(&pdev->dev, "Missing mdio platform device\n"); - return -EINVAL; - } - snprintf(slave_data->phy_id, sizeof(slave_data->phy_id), - PHY_ID_FMT, mdio->name, phyid); slave_data->phy_if = of_get_phy_mode(slave_node); if (slave_data->phy_if < 0) { dev_err(&pdev->dev, "Missing or malformed slave[%d] phy-mode property\n", From dfc0a6d39aad6d633141726eb2e37e15bda1fccd Mon Sep 17 00:00:00 2001 From: David Rivshin Date: Wed, 16 Dec 2015 23:02:11 -0500 Subject: [PATCH 09/67] drivers: net: cpsw: increment reference count on fixed-link PHY node When a fixed-link sub-node exists in a slave node, the slave node is also the PHY node. Since this is a separate use of the slave node, of_node_get() should be used to increment the reference count. Fixes: 1f71e8c96fc6 ("drivers: net: cpsw: Add support for fixed-link PHY") Signed-off-by: David Rivshin Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index bc6d20dc28a0..3b489caea096 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2036,16 +2036,21 @@ static int cpsw_probe_dt(struct cpsw_priv *priv, priv->phy_node = of_parse_phandle(slave_node, "phy-handle", 0); parp = of_get_property(slave_node, "phy_id", &lenp); if (of_phy_is_fixed_link(slave_node)) { - struct phy_device *pd; + struct device_node *phy_node; + struct phy_device *phy_dev; + /* In the case of a fixed PHY, the DT node associated + * to the PHY is the Ethernet MAC DT node. 
+ */ ret = of_phy_register_fixed_link(slave_node); if (ret) return ret; - pd = of_phy_find_device(slave_node); - if (!pd) + phy_node = of_node_get(slave_node); + phy_dev = of_phy_find_device(phy_node); + if (!phy_dev) return -ENODEV; snprintf(slave_data->phy_id, sizeof(slave_data->phy_id), - PHY_ID_FMT, pd->bus->id, pd->addr); + PHY_ID_FMT, phy_dev->bus->id, phy_dev->addr); } else if (parp) { u32 phyid; struct device_node *mdio_node; From fc9f5ea9b4ecbe9b7839c92f0a54261809c723d3 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 17 Dec 2015 15:35:37 +0200 Subject: [PATCH 10/67] net/mlx4_en: Remove dependency between timestamping capability and service_task Service task is responsible for other tasks in addition to timestamping overflow check. Launch it even if timestamping is not supported by device. Fixes: 07841f9d94c1 ('net/mlx4_en: Schedule napi when RX buffers allocation fails') Signed-off-by: Eugenia Emantayev Signed-off-by: Eran Ben Elisha Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 886e1bc86374..4eef316bbc82 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -3058,9 +3058,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, } queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); - if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) - queue_delayed_work(mdev->workqueue, &priv->service_task, - SERVICE_TASK_DELAY); + queue_delayed_work(mdev->workqueue, &priv->service_task, + SERVICE_TASK_DELAY); mlx4_en_set_stats_bitmap(mdev->dev, &priv->stats_bitmap, mdev->profile.prof[priv->port].rx_ppp, From 90683061dd50b0d70f01466c2d694f4e928a86f3 Mon Sep 17 00:00:00 2001 From: Eugenia Emantayev Date: Thu, 17 Dec 2015 15:35:38 +0200 Subject: [PATCH 11/67] net/mlx4_en: Fix HW timestamp init issue upon system startup mlx4_en_init_timestamp was called before creation of netdev and port init, thus used uninitialized values. Specifically - NIC frequency was incorrect causing wrong calculations and later wrong HW timestamps. Fixes: 1ec4864b1017 ('net/mlx4_en: Fixed crash when port type is changed') Signed-off-by: Eugenia Emantayev Signed-off-by: Marina Varshaver Signed-off-by: Eran Ben Elisha Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_clock.c | 7 +++++++ drivers/net/ethernet/mellanox/mlx4/en_main.c | 7 ------- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 7 +++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c index 8a083d73efdb..038f9ce391e6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -242,6 +242,13 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) unsigned long flags; u64 ns, zero = 0; + /* mlx4_en_init_timestamp is called for each netdev. + * mdev->ptp_clock is common for all ports, skip initialization if + * was done for other port. + */ + if (mdev->ptp_clock) + return; + rwlock_init(&mdev->clock_lock); memset(&mdev->cycles, 0, sizeof(mdev->cycles)); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c index 005f910ec955..e0ec280a7fa1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -232,9 +232,6 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) if (mdev->pndev[i]) mlx4_en_destroy_netdev(mdev->pndev[i]); - if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) - mlx4_en_remove_timestamp(mdev); - flush_workqueue(mdev->workqueue); destroy_workqueue(mdev->workqueue); (void) mlx4_mr_free(dev, &mdev->mr); @@ -320,10 +317,6 @@ static void *mlx4_en_add(struct mlx4_dev *dev) mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) mdev->port_cnt++; - /* Initialize time stamp mechanism */ - if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) - mlx4_en_init_timestamp(mdev); - /* Set default number of RX rings*/ mlx4_en_set_num_rx_rings(mdev); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 4eef316bbc82..7869f97de5da 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2072,6 +2072,9 @@ void mlx4_en_destroy_netdev(struct net_device *dev) /* flush any pending task for this netdev */ flush_workqueue(mdev->workqueue); + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + mlx4_en_remove_timestamp(mdev); + /* Detach the netdev so tasks would not attempt to access it */ mutex_lock(&mdev->state_lock); mdev->pndev[priv->port] = NULL; @@ -3058,6 +3061,10 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, } queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); + /* Initialize time stamp mechanism */ + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) + mlx4_en_init_timestamp(mdev); + queue_delayed_work(mdev->workqueue, &priv->service_task, SERVICE_TASK_DELAY); From 6e3cd5fa65318f35ec9c9f61bc5cdb55d4783cb9 Mon Sep 17 00:00:00 2001 From: Venkat Duvvuru Date: Fri, 18 Dec 2015 01:40:50 +0530 Subject: [PATCH 12/67] be2net: Avoid accessing eq object in be_msix_register routine, when i < 0. When the first request_irq fails in be_msix_register, i value would be zero. The current code decrements the i value and accesses the eq object without validating the decremented "i" value. This can cause an "invalid memory address access" violation. This patch fixes the problem by accessing the eq object after validating the "i" value. Signed-off-by: Venkat Duvvuru Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index b6ad02909d6b..65988202f954 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -3299,8 +3299,10 @@ static int be_msix_register(struct be_adapter *adapter) return 0; err_msix: - for (i--, eqo = &adapter->eq_obj[i]; i >= 0; i--, eqo--) + for (i--; i >= 0; i--) { + eqo = &adapter->eq_obj[i]; free_irq(be_msix_vec_get(adapter, eqo), eqo); + } dev_warn(&adapter->pdev->dev, "MSIX Request IRQ failed - err %d\n", status); be_msix_disable(adapter); From acf673a3187edf72068ee2f92f4dc47d66baed47 Mon Sep 17 00:00:00 2001 From: David Miller Date: Thu, 17 Dec 2015 16:05:32 -0500 Subject: [PATCH 13/67] 6pack: Fix use after free in sixpack_close(). Need to do the unregister_device() after all references to the driver private have been done. Also we need to use del_timer_sync() for the timers so that we don't have any asynchronous references after the unregister. Signed-off-by: David S. Miller --- drivers/net/hamradio/6pack.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 7c4a4151ef0f..9f0b1c342b77 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -683,14 +683,14 @@ static void sixpack_close(struct tty_struct *tty) if (!atomic_dec_and_test(&sp->refcnt)) down(&sp->dead_sem); - unregister_netdev(sp->dev); - - del_timer(&sp->tx_t); - del_timer(&sp->resync_t); + del_timer_sync(&sp->tx_t); + del_timer_sync(&sp->resync_t); /* Free all 6pack frame buffers. */ kfree(sp->rbuff); kfree(sp->xbuff); + + unregister_netdev(sp->dev); } /* Perform I/O control on an active 6pack channel. 
*/ From d79f16c046086f4fe0d42184a458e187464eb83e Mon Sep 17 00:00:00 2001 From: David Miller Date: Thu, 17 Dec 2015 16:05:49 -0500 Subject: [PATCH 14/67] mkiss: Fix use after free in mkiss_close(). Need to do the unregister_device() after all references to the driver private have been done. Signed-off-by: David S. Miller --- drivers/net/hamradio/mkiss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 216bfd350169..0b72b9de5207 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -798,13 +798,13 @@ static void mkiss_close(struct tty_struct *tty) if (!atomic_dec_and_test(&ax->refcnt)) down(&ax->dead_sem); - unregister_netdev(ax->dev); - /* Free all AX25 frame buffers. */ kfree(ax->rbuff); kfree(ax->xbuff); ax->tty = NULL; + + unregister_netdev(ax->dev); } /* Perform I/O control on an active ax25 channel. */ From 6d3c348a63685410b12bf961b97063efeef2f901 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 17 Dec 2015 16:46:39 -0800 Subject: [PATCH 15/67] ipip: ioctl: Remove superfluous IP-TTL handling. IP-TTL case is already handled in ip_tunnel_ioctl() API. Signed-off-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- net/ipv4/ipip.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index f34c31defafe..a09fb0dec725 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -253,9 +253,6 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) p.i_key = p.o_key = 0; p.i_flags = p.o_flags = 0; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - err = ip_tunnel_ioctl(dev, &p, cmd); if (err) return err; From ea2465af3bbfa7994d134a401503966ee98710b6 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Fri, 18 Dec 2015 10:42:12 +0200 Subject: [PATCH 16/67] bnx2x: Prevent FW assertion when using Vxlan FW has a rare corner case in which a fragmented packet using lots of frags would not be linearized, causing the FW to assert while trying to transmit the packet. To prevent this, we need to make sure the window of fragments containing MSS worth of data contains 1 BD less than for regular packets due to the additional parsing BD. Signed-off-by: Yuval Mintz Signed-off-by: Ariel Elior Signed-off-by: David S. 
Miller --- .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index f8d7a2f06950..c82ab87fcbe8 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -3430,25 +3430,29 @@ static u32 bnx2x_xmit_type(struct bnx2x *bp, struct sk_buff *skb) return rc; } -#if (MAX_SKB_FRAGS >= MAX_FETCH_BD - 3) +/* VXLAN: 4 = 1 (for linear data BD) + 3 (2 for PBD and last BD) */ +#define BNX2X_NUM_VXLAN_TSO_WIN_SUB_BDS 4 + +/* Regular: 3 = 1 (for linear data BD) + 2 (for PBD and last BD) */ +#define BNX2X_NUM_TSO_WIN_SUB_BDS 3 + +#if (MAX_SKB_FRAGS >= MAX_FETCH_BD - BDS_PER_TX_PKT) /* check if packet requires linearization (packet is too fragmented) no need to check fragmentation if page size > 8K (there will be no violation to FW restrictions) */ static int bnx2x_pkt_req_lin(struct bnx2x *bp, struct sk_buff *skb, u32 xmit_type) { - int to_copy = 0; - int hlen = 0; - int first_bd_sz = 0; + int first_bd_sz = 0, num_tso_win_sub = BNX2X_NUM_TSO_WIN_SUB_BDS; + int to_copy = 0, hlen = 0; - /* 3 = 1 (for linear data BD) + 2 (for PBD and last BD) */ - if (skb_shinfo(skb)->nr_frags >= (MAX_FETCH_BD - 3)) { + if (xmit_type & XMIT_GSO_ENC) + num_tso_win_sub = BNX2X_NUM_VXLAN_TSO_WIN_SUB_BDS; + if (skb_shinfo(skb)->nr_frags >= (MAX_FETCH_BD - num_tso_win_sub)) { if (xmit_type & XMIT_GSO) { unsigned short lso_mss = skb_shinfo(skb)->gso_size; - /* Check if LSO packet needs to be copied: - 3 = 1 (for headers BD) + 2 (for PBD and last BD) */ - int wnd_size = MAX_FETCH_BD - 3; + int wnd_size = MAX_FETCH_BD - num_tso_win_sub; /* Number of windows to check */ int num_wnds = skb_shinfo(skb)->nr_frags - wnd_size; int wnd_idx = 0; From e905eabc90a5b787d8708df164543ee295bea5f2 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 18 Dec 2015 19:43:15 +0900 Subject: 
[PATCH 17/67] openvswitch: correct encoding of set tunnel action attributes In a set action tunnel attributes should be encoded in a nested action. I noticed this because ovs-dpctl was reporting an error when dumping flows due to the incorrect encoding of tunnel attributes in a set action. Fixes: fc4099f17240 ("openvswitch: Fix egress tunnel info.") Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 907d6fd28ede..d1bd4a45ca2d 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2434,7 +2434,10 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) if (!start) return -EMSGSIZE; - err = ovs_nla_put_tunnel_info(skb, tun_info); + err = ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info)); if (err) return err; nla_nest_end(skb, start); From b4a1b4f5047e4f54e194681125c74c0aa64d637d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 18 Dec 2015 01:34:26 +0000 Subject: [PATCH 18/67] KEYS: Fix race between read and revoke This fixes CVE-2015-7550. There's a race between keyctl_read() and keyctl_revoke(). If the revoke happens between keyctl_read() checking the validity of a key and the key's semaphore being taken, then the key type read method will see a revoked key. This causes a problem for the user-defined key type because it assumes in its read method that there will always be a payload in a non-revoked key and doesn't check for a NULL pointer. Fix this by making keyctl_read() check the validity of a key after taking semaphore instead of before. I think the bug was introduced with the original keyrings code. This was discovered by a multithreaded test program generated by syzkaller (http://github.com/google/syzkaller). 
Here's a cleaned up version: #include #include #include void *thr0(void *arg) { key_serial_t key = (unsigned long)arg; keyctl_revoke(key); return 0; } void *thr1(void *arg) { key_serial_t key = (unsigned long)arg; char buffer[16]; keyctl_read(key, buffer, 16); return 0; } int main() { key_serial_t key = add_key("user", "%", "foo", 3, KEY_SPEC_USER_KEYRING); pthread_t th[5]; pthread_create(&th[0], 0, thr0, (void *)(unsigned long)key); pthread_create(&th[1], 0, thr1, (void *)(unsigned long)key); pthread_create(&th[2], 0, thr0, (void *)(unsigned long)key); pthread_create(&th[3], 0, thr1, (void *)(unsigned long)key); pthread_join(th[0], 0); pthread_join(th[1], 0); pthread_join(th[2], 0); pthread_join(th[3], 0); return 0; } Build as: cc -o keyctl-race keyctl-race.c -lkeyutils -lpthread Run as: while keyctl-race; do :; done as it may need several iterations to crash the kernel. The crash can be summarised as: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: [] user_read+0x56/0xa3 ... 
Call Trace: [] keyctl_read_key+0xb6/0xd7 [] SyS_keyctl+0x83/0xe0 [] entry_SYSCALL_64_fastpath+0x12/0x6f Reported-by: Dmitry Vyukov Signed-off-by: David Howells Tested-by: Dmitry Vyukov Cc: stable@vger.kernel.org Signed-off-by: James Morris --- security/keys/keyctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index fb111eafcb89..1c3872aeed14 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -751,16 +751,16 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) /* the key is probably readable - now try to read it */ can_read_key: - ret = key_validate(key); - if (ret == 0) { - ret = -EOPNOTSUPP; - if (key->type->read) { - /* read the data with the semaphore held (since we - * might sleep) */ - down_read(&key->sem); + ret = -EOPNOTSUPP; + if (key->type->read) { + /* Read the data with the semaphore held (since we might sleep) + * to protect against the key being updated or revoked. + */ + down_read(&key->sem); + ret = key_validate(key); + if (ret == 0) ret = key->type->read(key, buffer, buflen); - up_read(&key->sem); - } + up_read(&key->sem); } error2: From 179ccc0a73641ffd24e44ff10a7bd494efe98d8d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 19 Dec 2015 10:45:28 +0800 Subject: [PATCH 19/67] rhashtable: Kill harmless RCU warning in rhashtable_walk_init The commit c6ff5268293ef98e48a99597e765ffc417e39fa5 ("rhashtable: Fix walker list corruption") causes a suspicious RCU usage warning because we no longer hold ht->mutex when we dereference ht->tbl. However, this is a false positive because we now hold ht->lock which also guarantees that ht->tbl won't disappear from under us. This patch kills the warning by using rcu_dereference_protected. Reported-by: kernel test robot Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- lib/rhashtable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index eb9240c458fa..51282f579760 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -519,7 +519,8 @@ int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) return -ENOMEM; spin_lock(&ht->lock); - iter->walker->tbl = rht_dereference(ht->tbl, ht); + iter->walker->tbl = + rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); list_add(&iter->walker->list, &iter->walker->tbl->walkers); spin_unlock(&ht->lock); From 45af55006c2c8f49bddc6296224e70d752a1372c Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 19 Dec 2015 15:13:49 +0300 Subject: [PATCH 20/67] natsemi: add checks for dma mapping errors refill_rx() and start_tx() do not check if mapping dma memory succeeds. The patch adds the checks and failure handling. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: David S. Miller --- drivers/net/ethernet/natsemi/natsemi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c index b83f7c0fcf99..122c2ee3dfe2 100644 --- a/drivers/net/ethernet/natsemi/natsemi.c +++ b/drivers/net/ethernet/natsemi/natsemi.c @@ -1937,6 +1937,12 @@ static void refill_rx(struct net_device *dev) break; /* Better luck next round. */ np->rx_dma[entry] = pci_map_single(np->pci_dev, skb->data, buflen, PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(np->pci_dev, + np->rx_dma[entry])) { + dev_kfree_skb_any(skb); + np->rx_skbuff[entry] = NULL; + break; /* Better luck next round. 
*/ + } np->rx_ring[entry].addr = cpu_to_le32(np->rx_dma[entry]); } np->rx_ring[entry].cmd_status = cpu_to_le32(np->rx_buf_sz); @@ -2093,6 +2099,12 @@ static netdev_tx_t start_tx(struct sk_buff *skb, struct net_device *dev) np->tx_skbuff[entry] = skb; np->tx_dma[entry] = pci_map_single(np->pci_dev, skb->data,skb->len, PCI_DMA_TODEVICE); + if (pci_dma_mapping_error(np->pci_dev, np->tx_dma[entry])) { + np->tx_skbuff[entry] = NULL; + dev_kfree_skb_irq(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; + } np->tx_ring[entry].addr = cpu_to_le32(np->tx_dma[entry]); From 670c0d62ea5d16026adde5f3538b1caaa904a909 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Fri, 18 Dec 2015 14:43:33 +0100 Subject: [PATCH 21/67] net: usb: cdc_ncm: Adding Dell DW5812 LTE Verizon Mobile Broadband Card Unlike DW5550, Dell DW5812 is a mobile broadband card with no ARP capabilities: the patch makes this device to use wwan_noarp_info struct Signed-off-by: Daniele Palmas Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 1e9843a41168..3a71e60254eb 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1558,6 +1558,15 @@ static const struct usb_device_id cdc_devs[] = { .driver_info = (unsigned long) &wwan_info, }, + /* DW5812 LTE Verizon Mobile Broadband Card + * Unlike DW5550 this device requires FLAG_NOARP + */ + { USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x81bb, + USB_CLASS_COMM, + USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_noarp_info, + }, + /* Dell branded MBM devices like DW5550 */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, From fb83d5f283dc699c891b30c341e758d9a060a7c6 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Fri, 18 Dec 2015 14:43:34 +0100 Subject: [PATCH 22/67] net: usb: cdc_ncm: Adding Dell DW5813 LTE AT&T Mobile Broadband Card Unlike DW5550, Dell DW5813 
is a mobile broadband card with no ARP capabilities: the patch makes this device to use wwan_noarp_info struct Signed-off-by: Daniele Palmas Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 3a71e60254eb..369405271437 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1567,6 +1567,15 @@ static const struct usb_device_id cdc_devs[] = { .driver_info = (unsigned long)&wwan_noarp_info, }, + /* DW5813 LTE AT&T Mobile Broadband Card + * Unlike DW5550 this device requires FLAG_NOARP + */ + { USB_DEVICE_AND_INTERFACE_INFO(0x413c, 0x81bc, + USB_CLASS_COMM, + USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_noarp_info, + }, + /* Dell branded MBM devices like DW5550 */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, From 0d96e4bab2855a030077cc695a3563fd7cb0e7d8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 18 Dec 2015 19:16:57 +0800 Subject: [PATCH 23/67] crypto: algif_skcipher - Use new skcipher interface This patch replaces uses of ablkcipher with the new skcipher interface. 
Cc: stable@vger.kernel.org Signed-off-by: Herbert Xu Tested-by: --- crypto/algif_skcipher.c | 61 ++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index af31a0ee4057..973fe45e0515 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -47,7 +47,7 @@ struct skcipher_ctx { bool merge; bool enc; - struct ablkcipher_request req; + struct skcipher_request req; }; struct skcipher_async_rsgl { @@ -64,13 +64,13 @@ struct skcipher_async_req { }; #define GET_SREQ(areq, ctx) (struct skcipher_async_req *)((char *)areq + \ - crypto_ablkcipher_reqsize(crypto_ablkcipher_reqtfm(&ctx->req))) + crypto_skcipher_reqsize(crypto_skcipher_reqtfm(&ctx->req))) #define GET_REQ_SIZE(ctx) \ - crypto_ablkcipher_reqsize(crypto_ablkcipher_reqtfm(&ctx->req)) + crypto_skcipher_reqsize(crypto_skcipher_reqtfm(&ctx->req)) #define GET_IV_SIZE(ctx) \ - crypto_ablkcipher_ivsize(crypto_ablkcipher_reqtfm(&ctx->req)) + crypto_skcipher_ivsize(crypto_skcipher_reqtfm(&ctx->req)) #define MAX_SGL_ENTS ((4096 - sizeof(struct skcipher_sg_list)) / \ sizeof(struct scatterlist) - 1) @@ -302,8 +302,8 @@ static int skcipher_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct skcipher_ctx *ctx = ask->private; - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(&ctx->req); - unsigned ivsize = crypto_ablkcipher_ivsize(tfm); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(&ctx->req); + unsigned ivsize = crypto_skcipher_ivsize(tfm); struct skcipher_sg_list *sgl; struct af_alg_control con = {}; long copied = 0; @@ -507,7 +507,7 @@ static int skcipher_recvmsg_async(struct socket *sock, struct msghdr *msg, struct skcipher_sg_list *sgl; struct scatterlist *sg; struct skcipher_async_req *sreq; - struct ablkcipher_request *req; + struct skcipher_request *req; struct skcipher_async_rsgl *last_rsgl = NULL; unsigned int txbufs = 0, 
len = 0, tx_nents = skcipher_all_sg_nents(ctx); unsigned int reqlen = sizeof(struct skcipher_async_req) + @@ -531,9 +531,9 @@ static int skcipher_recvmsg_async(struct socket *sock, struct msghdr *msg, } sg_init_table(sreq->tsg, tx_nents); memcpy(sreq->iv, ctx->iv, GET_IV_SIZE(ctx)); - ablkcipher_request_set_tfm(req, crypto_ablkcipher_reqtfm(&ctx->req)); - ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - skcipher_async_cb, sk); + skcipher_request_set_tfm(req, crypto_skcipher_reqtfm(&ctx->req)); + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + skcipher_async_cb, sk); while (iov_iter_count(&msg->msg_iter)) { struct skcipher_async_rsgl *rsgl; @@ -608,10 +608,10 @@ static int skcipher_recvmsg_async(struct socket *sock, struct msghdr *msg, if (mark) sg_mark_end(sreq->tsg + txbufs - 1); - ablkcipher_request_set_crypt(req, sreq->tsg, sreq->first_sgl.sgl.sg, - len, sreq->iv); - err = ctx->enc ? crypto_ablkcipher_encrypt(req) : - crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, sreq->tsg, sreq->first_sgl.sgl.sg, + len, sreq->iv); + err = ctx->enc ? crypto_skcipher_encrypt(req) : + crypto_skcipher_decrypt(req); if (err == -EINPROGRESS) { atomic_inc(&ctx->inflight); err = -EIOCBQUEUED; @@ -632,7 +632,7 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); struct skcipher_ctx *ctx = ask->private; - unsigned bs = crypto_ablkcipher_blocksize(crypto_ablkcipher_reqtfm( + unsigned bs = crypto_skcipher_blocksize(crypto_skcipher_reqtfm( &ctx->req)); struct skcipher_sg_list *sgl; struct scatterlist *sg; @@ -669,14 +669,13 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, if (!used) goto free; - ablkcipher_request_set_crypt(&ctx->req, sg, - ctx->rsgl.sg, used, - ctx->iv); + skcipher_request_set_crypt(&ctx->req, sg, ctx->rsgl.sg, used, + ctx->iv); err = af_alg_wait_for_completion( ctx->enc ? 
- crypto_ablkcipher_encrypt(&ctx->req) : - crypto_ablkcipher_decrypt(&ctx->req), + crypto_skcipher_encrypt(&ctx->req) : + crypto_skcipher_decrypt(&ctx->req), &ctx->completion); free: @@ -751,17 +750,17 @@ static struct proto_ops algif_skcipher_ops = { static void *skcipher_bind(const char *name, u32 type, u32 mask) { - return crypto_alloc_ablkcipher(name, type, mask); + return crypto_alloc_skcipher(name, type, mask); } static void skcipher_release(void *private) { - crypto_free_ablkcipher(private); + crypto_free_skcipher(private); } static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen) { - return crypto_ablkcipher_setkey(private, key, keylen); + return crypto_skcipher_setkey(private, key, keylen); } static void skcipher_wait(struct sock *sk) @@ -778,13 +777,13 @@ static void skcipher_sock_destruct(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); struct skcipher_ctx *ctx = ask->private; - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(&ctx->req); + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(&ctx->req); if (atomic_read(&ctx->inflight)) skcipher_wait(sk); skcipher_free_sgl(sk); - sock_kzfree_s(sk, ctx->iv, crypto_ablkcipher_ivsize(tfm)); + sock_kzfree_s(sk, ctx->iv, crypto_skcipher_ivsize(tfm)); sock_kfree_s(sk, ctx, ctx->len); af_alg_release_parent(sk); } @@ -793,20 +792,20 @@ static int skcipher_accept_parent(void *private, struct sock *sk) { struct skcipher_ctx *ctx; struct alg_sock *ask = alg_sk(sk); - unsigned int len = sizeof(*ctx) + crypto_ablkcipher_reqsize(private); + unsigned int len = sizeof(*ctx) + crypto_skcipher_reqsize(private); ctx = sock_kmalloc(sk, len, GFP_KERNEL); if (!ctx) return -ENOMEM; - ctx->iv = sock_kmalloc(sk, crypto_ablkcipher_ivsize(private), + ctx->iv = sock_kmalloc(sk, crypto_skcipher_ivsize(private), GFP_KERNEL); if (!ctx->iv) { sock_kfree_s(sk, ctx, len); return -ENOMEM; } - memset(ctx->iv, 0, crypto_ablkcipher_ivsize(private)); + memset(ctx->iv, 0, crypto_skcipher_ivsize(private)); 
INIT_LIST_HEAD(&ctx->tsgl); ctx->len = len; @@ -819,9 +818,9 @@ static int skcipher_accept_parent(void *private, struct sock *sk) ask->private = ctx; - ablkcipher_request_set_tfm(&ctx->req, private); - ablkcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, - af_alg_complete, &ctx->completion); + skcipher_request_set_tfm(&ctx->req, private); + skcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, + af_alg_complete, &ctx->completion); sk->sk_destruct = skcipher_sock_destruct; From ce8c839b74e3017996fad4e1b7ba2e2625ede82f Mon Sep 17 00:00:00 2001 From: Vijay Pandurangan Date: Fri, 18 Dec 2015 14:34:59 -0500 Subject: [PATCH 24/67] =?UTF-8?q?veth:=20don=E2=80=99t=20modify=20ip=5Fsum?= =?UTF-8?q?med;=20doing=20so=20treats=20packets=20with=20bad=20checksums?= =?UTF-8?q?=20as=20good.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Packets that arrive from real hardware devices have ip_summed == CHECKSUM_UNNECESSARY if the hardware verified the checksums, or CHECKSUM_NONE if the packet is bad or it was unable to verify it. The current version of veth will replace CHECKSUM_NONE with CHECKSUM_UNNECESSARY, which causes corrupt packets routed from hardware to a veth device to be delivered to the application. This caused applications at Twitter to receive corrupt data when network hardware was corrupting packets. We believe this was added as an optimization to skip computing and verifying checksums for communication between containers. However, locally generated packets have ip_summed == CHECKSUM_PARTIAL, so the code as written does nothing for them. As far as we can tell, after removing this code, these packets are transmitted from one stack to another unmodified (tcpdump shows invalid checksums on both sides, as expected), and they are delivered correctly to applications. 
We didn’t test every possible network configuration, but we tried a few common ones such as bridging containers, using NAT between the host and a container, and routing from hardware devices to containers. We have effectively deployed this in production at Twitter (by disabling RX checksum offloading on veth devices). This code dates back to the first version of the driver, commit ("[NET]: Virtual ethernet device driver"), so I suspect this bug occurred mostly because the driver API has evolved significantly since then. Commit <0b7967503dc97864f283a> ("net/veth: Fix packet checksumming") (in December 2010) fixed this for packets that get created locally and sent to hardware devices, by not changing CHECKSUM_PARTIAL. However, the same issue still occurs for packets coming in from hardware devices. Co-authored-by: Evan Jones Signed-off-by: Evan Jones Cc: Nicolas Dichtel Cc: Phil Sutter Cc: Toshiaki Makita Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Vijay Pandurangan Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- drivers/net/veth.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 0ef4a5ad5557..ba21d072be31 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -117,12 +117,6 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) kfree_skb(skb); goto drop; } - /* don't change ip_summed == CHECKSUM_PARTIAL, as that - * will cause bad checksum on forwarded packets - */ - if (skb->ip_summed == CHECKSUM_NONE && - rcv->features & NETIF_F_RXCSUM) - skb->ip_summed = CHECKSUM_UNNECESSARY; if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) { struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); From 5cbf20c747987346f8352b156e3f05d3b84ac4ac Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 20 Dec 2015 01:48:04 +0300 Subject: [PATCH 25/67] sh_eth: fix 16-bit descriptor field access endianness too Commit 1299653affa4 ("sh_eth: fix descriptor access endianness") only addressed the 32-bit buffer address field byte-swapping but the driver still accesses 16-bit frame/buffer length descriptor fields without the necessary byte-swapping -- which should affect the big-endian kernels. In order to be able to use {cpu|edmac}_to_{edmac|cpu}(), we need to declare the RX/TX descriptor word 1 as a 32-bit field and use shifts/masking to access the 16-bit subfields (which gets rid of the ugly #ifdef'ery too)... Signed-off-by: Sergei Shtylyov Signed-off-by: David S. 
Miller --- drivers/net/ethernet/renesas/sh_eth.c | 25 +++++++++++--------- drivers/net/ethernet/renesas/sh_eth.h | 33 +++++++++++++-------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index a0eaf50499a2..6a8fc0f341ff 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -1167,6 +1167,7 @@ static void sh_eth_ring_format(struct net_device *ndev) int tx_ringsize = sizeof(*txdesc) * mdp->num_tx_ring; int skbuff_size = mdp->rx_buf_sz + SH_ETH_RX_ALIGN + 32 - 1; dma_addr_t dma_addr; + u32 buf_len; mdp->cur_rx = 0; mdp->cur_tx = 0; @@ -1187,9 +1188,9 @@ static void sh_eth_ring_format(struct net_device *ndev) /* RX descriptor */ rxdesc = &mdp->rx_ring[i]; /* The size of the buffer is a multiple of 32 bytes. */ - rxdesc->buffer_length = ALIGN(mdp->rx_buf_sz, 32); - dma_addr = dma_map_single(&ndev->dev, skb->data, - rxdesc->buffer_length, + buf_len = ALIGN(mdp->rx_buf_sz, 32); + rxdesc->len = cpu_to_edmac(mdp, buf_len << 16); + dma_addr = dma_map_single(&ndev->dev, skb->data, buf_len, DMA_FROM_DEVICE); if (dma_mapping_error(&ndev->dev, dma_addr)) { kfree_skb(skb); @@ -1220,7 +1221,7 @@ static void sh_eth_ring_format(struct net_device *ndev) mdp->tx_skbuff[i] = NULL; txdesc = &mdp->tx_ring[i]; txdesc->status = cpu_to_edmac(mdp, TD_TFP); - txdesc->buffer_length = 0; + txdesc->len = cpu_to_edmac(mdp, 0); if (i == 0) { /* Tx descriptor address set */ sh_eth_write(ndev, mdp->tx_desc_dma, TDLAR); @@ -1429,7 +1430,8 @@ static int sh_eth_txfree(struct net_device *ndev) if (mdp->tx_skbuff[entry]) { dma_unmap_single(&ndev->dev, edmac_to_cpu(mdp, txdesc->addr), - txdesc->buffer_length, DMA_TO_DEVICE); + edmac_to_cpu(mdp, txdesc->len) >> 16, + DMA_TO_DEVICE); dev_kfree_skb_irq(mdp->tx_skbuff[entry]); mdp->tx_skbuff[entry] = NULL; free_num++; @@ -1439,7 +1441,7 @@ static int sh_eth_txfree(struct net_device *ndev) txdesc->status |= 
cpu_to_edmac(mdp, TD_TDLE); ndev->stats.tx_packets++; - ndev->stats.tx_bytes += txdesc->buffer_length; + ndev->stats.tx_bytes += edmac_to_cpu(mdp, txdesc->len) >> 16; } return free_num; } @@ -1458,6 +1460,7 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota) u32 desc_status; int skbuff_size = mdp->rx_buf_sz + SH_ETH_RX_ALIGN + 32 - 1; dma_addr_t dma_addr; + u32 buf_len; boguscnt = min(boguscnt, *quota); limit = boguscnt; @@ -1466,7 +1469,7 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota) /* RACT bit must be checked before all the following reads */ dma_rmb(); desc_status = edmac_to_cpu(mdp, rxdesc->status); - pkt_len = rxdesc->frame_length; + pkt_len = edmac_to_cpu(mdp, rxdesc->len) & RD_RFL; if (--boguscnt < 0) break; @@ -1532,7 +1535,8 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota) entry = mdp->dirty_rx % mdp->num_rx_ring; rxdesc = &mdp->rx_ring[entry]; /* The size of the buffer is 32 byte boundary. */ - rxdesc->buffer_length = ALIGN(mdp->rx_buf_sz, 32); + buf_len = ALIGN(mdp->rx_buf_sz, 32); + rxdesc->len = cpu_to_edmac(mdp, buf_len << 16); if (mdp->rx_skbuff[entry] == NULL) { skb = netdev_alloc_skb(ndev, skbuff_size); @@ -1540,8 +1544,7 @@ static int sh_eth_rx(struct net_device *ndev, u32 intr_status, int *quota) break; /* Better luck next round. 
*/ sh_eth_set_receive_align(skb); dma_addr = dma_map_single(&ndev->dev, skb->data, - rxdesc->buffer_length, - DMA_FROM_DEVICE); + buf_len, DMA_FROM_DEVICE); if (dma_mapping_error(&ndev->dev, dma_addr)) { kfree_skb(skb); break; @@ -2407,7 +2410,7 @@ static int sh_eth_start_xmit(struct sk_buff *skb, struct net_device *ndev) return NETDEV_TX_OK; } txdesc->addr = cpu_to_edmac(mdp, dma_addr); - txdesc->buffer_length = skb->len; + txdesc->len = cpu_to_edmac(mdp, skb->len << 16); dma_wmb(); /* TACT bit must be set after all the above writes */ if (entry >= mdp->num_tx_ring - 1) diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h index 26ad1cf0bcf1..72fcfc924589 100644 --- a/drivers/net/ethernet/renesas/sh_eth.h +++ b/drivers/net/ethernet/renesas/sh_eth.h @@ -283,7 +283,7 @@ enum DMAC_IM_BIT { DMAC_M_RINT1 = 0x00000001, }; -/* Receive descriptor bit */ +/* Receive descriptor 0 bits */ enum RD_STS_BIT { RD_RACT = 0x80000000, RD_RDLE = 0x40000000, RD_RFP1 = 0x20000000, RD_RFP0 = 0x10000000, @@ -298,6 +298,12 @@ enum RD_STS_BIT { #define RDFEND RD_RFP0 #define RD_RFP (RD_RFP1|RD_RFP0) +/* Receive descriptor 1 bits */ +enum RD_LEN_BIT { + RD_RFL = 0x0000ffff, /* receive frame length */ + RD_RBL = 0xffff0000, /* receive buffer length */ +}; + /* FCFTR */ enum FCFTR_BIT { FCFTR_RFF2 = 0x00040000, FCFTR_RFF1 = 0x00020000, @@ -307,7 +313,7 @@ enum FCFTR_BIT { #define DEFAULT_FIFO_F_D_RFF (FCFTR_RFF2 | FCFTR_RFF1 | FCFTR_RFF0) #define DEFAULT_FIFO_F_D_RFD (FCFTR_RFD2 | FCFTR_RFD1 | FCFTR_RFD0) -/* Transmit descriptor bit */ +/* Transmit descriptor 0 bits */ enum TD_STS_BIT { TD_TACT = 0x80000000, TD_TDLE = 0x40000000, TD_TFP1 = 0x20000000, TD_TFP0 = 0x10000000, @@ -317,6 +323,11 @@ enum TD_STS_BIT { #define TDFEND TD_TFP0 #define TD_TFP (TD_TFP1|TD_TFP0) +/* Transmit descriptor 1 bits */ +enum TD_LEN_BIT { + TD_TBL = 0xffff0000, /* transmit buffer length */ +}; + /* RMCR */ enum RMCR_BIT { RMCR_RNC = 0x00000001, @@ -425,15 +436,9 @@ enum 
TSU_FWSLC_BIT { */ struct sh_eth_txdesc { u32 status; /* TD0 */ -#if defined(__LITTLE_ENDIAN) - u16 pad0; /* TD1 */ - u16 buffer_length; /* TD1 */ -#else - u16 buffer_length; /* TD1 */ - u16 pad0; /* TD1 */ -#endif + u32 len; /* TD1 */ u32 addr; /* TD2 */ - u32 pad1; /* padding data */ + u32 pad0; /* padding data */ } __aligned(2) __packed; /* The sh ether Rx buffer descriptors. @@ -441,13 +446,7 @@ struct sh_eth_txdesc { */ struct sh_eth_rxdesc { u32 status; /* RD0 */ -#if defined(__LITTLE_ENDIAN) - u16 frame_length; /* RD1 */ - u16 buffer_length; /* RD1 */ -#else - u16 buffer_length; /* RD1 */ - u16 frame_length; /* RD1 */ -#endif + u32 len; /* RD1 */ u32 addr; /* RD2 */ u32 pad0; /* padding data */ } __aligned(2) __packed; From ef9cdd0fed3875b1ae9cc85987d8143354b2d4c8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 21 Dec 2015 09:56:01 +0100 Subject: [PATCH 26/67] switchdev: bridge: Pass ageing time as clock_t instead of jiffies The bridge's ageing time is offloaded to hardware when: 1) A port joins a bridge 2) The ageing time of the bridge is changed In the first case the ageing time is offloaded as jiffies, but in the second case it's offloaded as clock_t, which is what existing switchdev drivers expect to receive. Fixes: 6ac311ae8bfb ("Adding switchdev ageing notification on port bridged") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/bridge/br_stp_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 5396ff08af32..12045dea276c 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -39,7 +39,7 @@ void br_init_port(struct net_bridge_port *p) struct switchdev_attr attr = { .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER, - .u.ageing_time = p->br->ageing_time, + .u.ageing_time = jiffies_to_clock_t(p->br->ageing_time), }; int err; From e459dfeeb64008b2d23bdf600f03b3605dbb8152 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 21 Dec 2015 12:54:45 +0300 Subject: [PATCH 27/67] ipv6/addrlabel: fix ip6addrlbl_get() ip6addrlbl_get() has never worked. If ip6addrlbl_hold() succeeded, ip6addrlbl_get() will exit with '-ESRCH'. If ip6addrlbl_hold() failed, ip6addrlbl_get() will use an about-to-be-freed ip6addrlbl_entry pointer. Fix this by inverting the ip6addrlbl_hold() check. Fixes: 2a8cc6c89039 ("[IPV6] ADDRCONF: Support RFC3484 configurable address selection policy table.") Signed-off-by: Andrey Ryabinin Reviewed-by: Cong Wang Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S.
Miller --- net/ipv6/addrlabel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 882124ebb438..a8f6986dcbe5 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -552,7 +552,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh) rcu_read_lock(); p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - if (p && ip6addrlbl_hold(p)) + if (p && !ip6addrlbl_hold(p)) p = NULL; lseq = ip6addrlbl_table.seq; rcu_read_unlock(); From 5449a5ca9bc27dd51a462de7ca0b1cd861cd2bd0 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 21 Dec 2015 10:55:45 -0800 Subject: [PATCH 28/67] addrconf: always initialize sysctl table data When sysctl performs restrict writes, it allows to write from a middle position of a sysctl file, which requires us to initialize the table data before calling proc_dostring() for the write case. Fixes: 3d1bec99320d ("ipv6: introduce secret_stable to ipv6_devconf") Reported-by: Sasha Levin Acked-by: Hannes Frederic Sowa Tested-by: Sasha Levin Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 17f8e7ea133b..1f21087accab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5369,13 +5369,10 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, goto out; } - if (!write) { - err = snprintf(str, sizeof(str), "%pI6", - &secret->secret); - if (err >= sizeof(str)) { - err = -EIO; - goto out; - } + err = snprintf(str, sizeof(str), "%pI6", &secret->secret); + if (err >= sizeof(str)) { + err = -EIO; + goto out; } err = proc_dostring(&lctl, write, buffer, lenp, ppos); From d3805611130af9b911e908af9f67a3f64f4f0914 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 22 Dec 2015 15:48:44 -0700 Subject: [PATCH 29/67] block: Split bios on chunk boundaries For h/w that advertise their block storage's underlying chunk size, it's a big performance win to not submit commands that cross them. This patch uses that criteria if it is provided. If it is not provided, this patch uses the max sectors as before. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index e01405a3e8b3..e73846a3d08a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -81,7 +81,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, struct bio *new = NULL; bio_for_each_segment(bv, bio, iter) { - if (sectors + (bv.bv_len >> 9) > queue_max_sectors(q)) + if (sectors + (bv.bv_len >> 9) > blk_max_size_offset(q, bio->bi_iter.bi_sector)) goto split; /* From fac51590c1a077809984139e9bb9e06ed366f219 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Mon, 21 Dec 2015 17:01:24 +0200 Subject: [PATCH 30/67] IB/cma: cma_match_net_dev needs to take into account port_num Previously, cma_match_net_dev called cma_protocol_roce which tried to verify that the IB device uses RoCE protocol. 
However, if rdma_id wasn't bound to a port, then the check would occur against the first port of the device without regard to whether that port was even of the same type as the type of port the incoming packet was received on. Fix this by passing the port of the request and only checking against the same port of the device. Reported-by: Or Gerlitz Fixes: b8cab5dab15f ('IB/cma: Accept connection without a valid netdev on RoCE') Signed-off-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d2d5d004f16d..2d762a2ecd81 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1265,15 +1265,17 @@ static bool cma_protocol_roce(const struct rdma_cm_id *id) return cma_protocol_roce_dev_port(device, port_num); } -static bool cma_match_net_dev(const struct rdma_id_private *id_priv, - const struct net_device *net_dev) +static bool cma_match_net_dev(const struct rdma_cm_id *id, + const struct net_device *net_dev, + u8 port_num) { - const struct rdma_addr *addr = &id_priv->id.route.addr; + const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request or a RoCE request */ - return addr->src_addr.ss_family == AF_IB || - cma_protocol_roce(&id_priv->id); + return (!id->port_num || id->port_num == port_num) && + (addr->src_addr.ss_family == AF_IB || + cma_protocol_roce_dev_port(id->device, port_num)); return !addr->dev_addr.bound_dev_if || (net_eq(dev_net(net_dev), addr->dev_addr.net) && @@ -1295,13 +1297,13 @@ static struct rdma_id_private *cma_find_listener( hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && - cma_match_net_dev(id_priv, net_dev)) + cma_match_net_dev(&id_priv->id, net_dev, req->port)) return id_priv; 
list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_list) { if (id_priv_dev->id.device == cm_id->device && - cma_match_net_dev(id_priv_dev, net_dev)) + cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) return id_priv_dev; } } From df4176677fdf7ac0c5748083eb7a5b269fb1e156 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Thu, 17 Dec 2015 10:54:15 +0800 Subject: [PATCH 31/67] IB/mlx4: Replace kfree with kvfree in mlx4_ib_destroy_srq Commit 0ef2f05c7e02ff99c0b5b583d7dee2cd12b053f2 uses vmalloc for WR buffers when needed and uses kvfree to free the buffers. It missed changing kfree to kvfree in mlx4_ib_destroy_srq(). Reported-by: Matthew Finaly Signed-off-by: Wengang Wang Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/srq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 8d133c40fa0e..c394376ebe06 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -286,7 +286,7 @@ int mlx4_ib_destroy_srq(struct ib_srq *srq) mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); ib_umem_release(msrq->umem); } else { - kfree(msrq->wrid); + kvfree(msrq->wrid); mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, &msrq->buf); mlx4_db_free(dev->dev, &msrq->db); From ae35b56e367b9fef7f5de701cf8c1c3dd954dded Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Thu, 10 Dec 2015 18:22:31 +0200 Subject: [PATCH 32/67] drm/i915: Unbreak check_digital_port_conflicts() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Atomic changes broke check_digital_port_conflicts(). It needs to look at the global situation instead of just trying to find a conflict within the current atomic state. This bug made my HSW explode spectacularly after I had split the DDI encoders into separate DP and HDMI encoders. With the fix, things seem much more solid. 
I hope holding the connection_mutex is enough protection that we can actually walk the connectors even if they're not part of the current atomic state... v2: Regenerate the patch so that it actually applies (Jani) Cc: stable@vger.kernel.org Cc: Ander Conselvan de Oliveira Fixes: 5448a00d3f06 ("drm/i915: Don't use staged config in check_digital_port_conflicts()") Signed-off-by: Ville Syrjälä Reviewed-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1449764551-12466-1-git-send-email-ville.syrjala@linux.intel.com (cherry picked from commit 0bff4858653312a10c83709e0009c3adb87e6f1e) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index beb0374a19f1..32cf97346978 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -12123,18 +12123,22 @@ static void intel_dump_pipe_config(struct intel_crtc *crtc, static bool check_digital_port_conflicts(struct drm_atomic_state *state) { struct drm_device *dev = state->dev; - struct intel_encoder *encoder; struct drm_connector *connector; - struct drm_connector_state *connector_state; unsigned int used_ports = 0; - int i; /* * Walk the connector list instead of the encoder * list to detect the problem on ddi platforms * where there's just one encoder per digital port. 
*/ - for_each_connector_in_state(state, connector, connector_state, i) { + drm_for_each_connector(connector, dev) { + struct drm_connector_state *connector_state; + struct intel_encoder *encoder; + + connector_state = drm_atomic_get_existing_connector_state(state, connector); + if (!connector_state) + connector_state = connector->state; + if (!connector_state->best_encoder) continue; From c1a9a291cee0890eb0f435243f3fb84fefb04348 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Wed, 23 Dec 2015 22:44:37 +0100 Subject: [PATCH 33/67] ipv6: honor ifindex in case we receive ll addresses in router advertisements Marc Haber reported we don't honor interface indexes when we receive link local router addresses in router advertisements. Luckily the non-strict version of ipv6_chk_addr already does the correct job here, so we can simply use it to lighten the checks and use those addresses by default without any configuration change. Link: Reported-by: Marc Haber Cc: Marc Haber Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. 
Miller --- net/ipv6/ndisc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d6161e1c48c8..84afb9a77278 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1183,7 +1183,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) */ if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); @@ -1337,7 +1337,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - NULL, 0)) { + in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: router info ignored.\n", skb->dev->name); From 184fc8b5ee601cd83dbbdf3e6cfec5f5b8d3b41a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 23 Dec 2015 16:54:27 +0100 Subject: [PATCH 34/67] geneve: initialize needed_headroom Currently the needed_headroom field for the geneve device is left at the default value. This patch sets it to the space required for basic geneve encapsulation, so that we can avoid the skb head re-allocation on xmit. This gives a 6% speedup for unsegmented traffic on a geneve tunnel. v1 -> v2: - add ETH_HLEN for the lower device to the needed headroom Signed-off-by: Paolo Abeni Acked-by: Hannes Frederic Sowa Acked-by: John W. Linville Signed-off-by: David S.
Miller --- drivers/net/geneve.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index c2b79f5d1c89..58efdec12f30 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1155,7 +1155,7 @@ static int geneve_configure(struct net *net, struct net_device *dev, struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); bool tun_collect_md, tun_on_same_port; - int err; + int err, encap_len; if (!remote) return -EINVAL; @@ -1187,6 +1187,14 @@ static int geneve_configure(struct net *net, struct net_device *dev, if (t) return -EBUSY; + /* make enough headroom for basic scenario */ + encap_len = GENEVE_BASE_HLEN + ETH_HLEN; + if (remote->sa.sa_family == AF_INET) + encap_len += sizeof(struct iphdr); + else + encap_len += sizeof(struct ipv6hdr); + dev->needed_headroom = encap_len + ETH_HLEN; + if (metadata) { if (tun_on_same_port) return -EPERM; From 1dfddff5fcd869fcab0c52fafae099dfa435a935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Wed, 23 Dec 2015 13:42:43 +0100 Subject: [PATCH 35/67] net: cdc_ncm: avoid changing RX/TX buffers on MTU changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NCM buffer sizes are negotiated with the device independently of the network device MTU. The RX buffers are allocated by the usbnet framework based on the rx_urb_size value set by cdc_ncm. A single RX buffer can hold a number of MTU sized packets. The default usbnet change_mtu ndo only modifies rx_urb_size if it is equal to hard_mtu. And the cdc_ncm driver will set rx_urb_size and hard_mtu independently of each other, based on dwNtbInMaxSize and dwNtbOutMaxSize respectively. It was therefore assumed that usbnet_change_mtu() would never touch rx_urb_size. This failed to consider the case where dwNtbInMaxSize and dwNtbOutMaxSize happens to be equal. 
Fix by implementing an NCM specific change_mtu ndo, modifying the netdev MTU without touching the buffer size settings. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/cdc_mbim.c | 2 +- drivers/net/usb/cdc_ncm.c | 31 +++++++++++++++++++++++++++++++ include/linux/usb/cdc_ncm.h | 1 + 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 8973abdec9f6..bdd83d95ec0a 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -100,7 +100,7 @@ static const struct net_device_ops cdc_mbim_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, - .ndo_change_mtu = usbnet_change_mtu, + .ndo_change_mtu = cdc_ncm_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_vlan_rx_add_vid = cdc_mbim_rx_add_vid, diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 369405271437..e8a1144c5a8b 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -689,6 +690,33 @@ static void cdc_ncm_free(struct cdc_ncm_ctx *ctx) kfree(ctx); } +/* we need to override the usbnet change_mtu ndo for two reasons: + * - respect the negotiated maximum datagram size + * - avoid unwanted changes to rx and tx buffers + */ +int cdc_ncm_change_mtu(struct net_device *net, int new_mtu) +{ + struct usbnet *dev = netdev_priv(net); + struct cdc_ncm_ctx *ctx = (struct cdc_ncm_ctx *)dev->data[0]; + int maxmtu = ctx->max_datagram_size - cdc_ncm_eth_hlen(dev); + + if (new_mtu <= 0 || new_mtu > maxmtu) + return -EINVAL; + net->mtu = new_mtu; + return 0; +} +EXPORT_SYMBOL_GPL(cdc_ncm_change_mtu); + +static const struct net_device_ops cdc_ncm_netdev_ops = { + .ndo_open = usbnet_open, + .ndo_stop = usbnet_stop, + .ndo_start_xmit = usbnet_start_xmit, + .ndo_tx_timeout = usbnet_tx_timeout, + 
.ndo_change_mtu = cdc_ncm_change_mtu, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, +}; + int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags) { struct cdc_ncm_ctx *ctx; @@ -823,6 +851,9 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ /* add our sysfs attrs */ dev->net->sysfs_groups[0] = &cdc_ncm_sysfs_attr_group; + /* must handle MTU changes */ + dev->net->netdev_ops = &cdc_ncm_netdev_ops; + return 0; error2: diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 1f6526c76ee8..3a375d07d0dc 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -138,6 +138,7 @@ struct cdc_ncm_ctx { }; u8 cdc_ncm_select_altsetting(struct usb_interface *intf); +int cdc_ncm_change_mtu(struct net_device *net, int new_mtu); int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags); void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf); struct sk_buff *cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign); From 3358a5c0c1578fa215f90a0e750579cd6258ddd9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 24 Dec 2015 12:21:22 +0300 Subject: [PATCH 36/67] qlcnic: fix a loop exit condition better In the original code, if we succeeded on the last iteration through the loop then we still returned failure. Fixes: 389e4e04ad2d ('qlcnic: fix a timeout loop') Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c index b1a452f291ee..34906750b7e7 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_vnic.c @@ -252,7 +252,7 @@ int qlcnic_83xx_check_vnic_state(struct qlcnic_adapter *adapter) state = QLCRDX(ahw, QLC_83XX_VNIC_STATE); } - if (!idc->vnic_wait_limit) { + if (state != QLCNIC_DEV_NPAR_OPER) { dev_err(&adapter->pdev->dev, "vNIC mode not operational, state check timed out.\n"); return -EIO; From 9ba0b9636dc07a328ad3bffe9b22edb4cbb2901b Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 23 Dec 2015 16:28:40 -0200 Subject: [PATCH 37/67] sctp: use GFP_USER for user-controlled kmalloc Commit cacc06215271 ("sctp: use GFP_USER for user-controlled kmalloc") missed two other spots. For connectx, as it's more likely to be used by kernel users of the API, it detects if GFP_USER should be used or not. Fixes: cacc06215271 ("sctp: use GFP_USER for user-controlled kmalloc") Reported-by: Dmitry Vyukov Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9b6cc6de80d8..570f96ad4527 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1301,8 +1301,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk, int addrs_size, sctp_assoc_t *assoc_id) { - int err = 0; struct sockaddr *kaddrs; + gfp_t gfp = GFP_KERNEL; + int err = 0; pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n", __func__, sk, addrs, addrs_size); @@ -1315,7 +1316,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk, return -EFAULT; /* Alloc space for the address array in kernel memory. 
*/ - kaddrs = kmalloc(addrs_size, GFP_KERNEL); + if (sk->sk_socket->file) + gfp = GFP_USER | __GFP_NOWARN; + kaddrs = kmalloc(addrs_size, gfp); if (unlikely(!kaddrs)) return -ENOMEM; @@ -5773,7 +5776,7 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len, len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num; - ids = kmalloc(len, GFP_KERNEL); + ids = kmalloc(len, GFP_USER | __GFP_NOWARN); if (unlikely(!ids)) return -ENOMEM; From 3538a5c8ffa37c715029af4a2e384c077558eb18 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Wed, 23 Dec 2015 16:44:09 -0200 Subject: [PATCH 38/67] sctp: label accepted/peeled off sockets Accepted or peeled off sockets were missing a security label (e.g. SELinux) which means that socket was in "unlabeled" state. This patch clones the sock's label from the parent sock and resolves the issue (similar to AF_BLUETOOTH protocol family). Cc: Paul Moore Cc: David Teigland Signed-off-by: Marcelo Ricardo Leitner Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/sctp/socket.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 570f96ad4527..529ed357a2cf 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7202,6 +7202,8 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); + + security_sk_clone(sk, newsk); } static inline void sctp_copy_descendant(struct sock *sk_to, From c6002d5602ab36c19ef4fe0e20ecfa28aaabf028 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 24 Dec 2015 13:14:05 -0500 Subject: [PATCH 39/67] RDMA/ocrdma: Fix vlan-id assignment in qp parameters vlan-id is wrongly getting as 0 when PFC is enabled. Set vlan-id configured by user in QP parameters. In case vlan interface is not used, flash a warning to user to configure vlan and assign vlan-id as 0 in qp params. 
Fixes: dbf727de7440 ('IB/core: Use GID table in AH creation and dmac resolution') Cc: Matan Barak Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 30f67bebffa3..4fc2bb49c28e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -2515,9 +2515,10 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid)); cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8); - if (vlan_id < 0x1000) { - if (dev->pfc_state) { - vlan_id = 0; + if (vlan_id == 0xFFFF) + vlan_id = 0; + if (vlan_id || dev->pfc_state) { + if (!vlan_id) { pr_err("ocrdma%d:Using VLAN with PFC is recommended\n", dev->id); pr_err("ocrdma%d:Using VLAN 0 for this connection\n", From 36ac0db0dbf7081afe4137d444ef85614213b8eb Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 24 Dec 2015 13:14:06 -0500 Subject: [PATCH 40/67] RDMA/ocrdma: Dispatch only port event when port state changes Dispatch only port event to IB stack when port state changes. Don't explicitly modify qps to error. Let application listen to port events on async event queue or let QP fail with retry-exceeded completion error. 
Signed-off-by: Padmanabh Ratnakar Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 23 ---------------------- 1 file changed, 23 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 62b7009daa6c..ebe40b414c9d 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -386,30 +386,7 @@ static int ocrdma_open(struct ocrdma_dev *dev) static int ocrdma_close(struct ocrdma_dev *dev) { - int i; - struct ocrdma_qp *qp, **cur_qp; struct ib_event err_event; - struct ib_qp_attr attrs; - int attr_mask = IB_QP_STATE; - - attrs.qp_state = IB_QPS_ERR; - mutex_lock(&dev->dev_lock); - if (dev->qp_tbl) { - cur_qp = dev->qp_tbl; - for (i = 0; i < OCRDMA_MAX_QP; i++) { - qp = cur_qp[i]; - if (qp && qp->ibqp.qp_type != IB_QPT_GSI) { - /* change the QP state to ERROR */ - _ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask); - - err_event.event = IB_EVENT_QP_FATAL; - err_event.element.qp = &qp->ibqp; - err_event.device = &dev->ibdev; - ib_dispatch_event(&err_event); - } - } - } - mutex_unlock(&dev->dev_lock); err_event.event = IB_EVENT_PORT_ERR; err_event.element.port_num = 1; From 10a214dc996236e6547b84fb5ca007316b30c2e6 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 24 Dec 2015 13:14:07 -0500 Subject: [PATCH 41/67] RDMA/ocrdma: Depend on async link events from CNA Recently Doug Ledford reported a deadlock happening between ocrdma-load sequence and NetworkManager service issuing "open" on be2net interface. The deadlock happens when any be2net hook (e.g. open/close) is called in parallel to insmod ocrdma.ko. A. be2net is sending administrative open/close event to ocrdma holding device_list_mutex. It does this from ndo_open/ndo_stop hooks of be2net. So sequence of locks is rtnl_lock---> device_list lock B.
When new ocrdma roce device gets registered, infiniband stack now takes rtnl_lock in ib_register_device() in GID initialization routines. So sequence of locks in this path is device_list lock ---> rtnl_lock. This improper locking sequence causes deadlock. With this patch we stop using administrative open and close events injected by be2net driver. These events were used to dispatch PORT_ACTIVE and PORT_ERROR events to the IB-stack. This patch implements a logic to receive async-link-events generated from CNA whenever link-state-change is detected. Now on, these async-events will be used to dispatch PORT_ACTIVE and PORT_ERROR events to IB-stack. Depending on async-events from CNA removes the need to hold device-list-mutex and thus breaks the busy-wait scenario. Reported-by: Doug Ledford CC: Sathya Perla Signed-off-by: Padmanabh Ratnakar Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/ocrdma/ocrdma.h | 10 +++++ drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 42 +++++++++++++++--- drivers/infiniband/hw/ocrdma/ocrdma_hw.h | 4 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 34 ++++++++++---- drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 49 +++++++++++++++++++-- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- 6 files changed, 119 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index ae80590aabdf..040bb8b5cb15 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -232,6 +232,10 @@ struct phy_info { u16 interface_type; }; +enum ocrdma_flags { + OCRDMA_FLAGS_LINK_STATUS_INIT = 0x01 +}; + struct ocrdma_dev { struct ib_device ibdev; struct ocrdma_dev_attr attr; @@ -287,6 +291,7 @@ struct ocrdma_dev { atomic_t update_sl; u16 pvid; u32 asic_id; + u32 flags; ulong last_stats_time; struct mutex stats_lock; /* provide synch for debugfs operations */ @@ -591,4 +596,9 @@ static inline u8 
ocrdma_is_enabled_and_synced(u32 state) (state & OCRDMA_STATE_FLAG_SYNC); } +static inline u8 ocrdma_get_ae_link_state(u32 ae_state) +{ + return ((ae_state & OCRDMA_AE_LSC_LS_MASK) >> OCRDMA_AE_LSC_LS_SHIFT); +} + #endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 4fc2bb49c28e..283ca842ff74 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -579,6 +579,8 @@ static int ocrdma_mbx_create_mq(struct ocrdma_dev *dev, cmd->async_event_bitmap = BIT(OCRDMA_ASYNC_GRP5_EVE_CODE); cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_RDMA_EVE_CODE); + /* Request link events on this MQ. */ + cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_LINK_EVE_CODE); cmd->async_cqid_ringsize = cq->id; cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) << @@ -819,20 +821,42 @@ static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev, } } +static void ocrdma_process_link_state(struct ocrdma_dev *dev, + struct ocrdma_ae_mcqe *cqe) +{ + struct ocrdma_ae_lnkst_mcqe *evt; + u8 lstate; + + evt = (struct ocrdma_ae_lnkst_mcqe *)cqe; + lstate = ocrdma_get_ae_link_state(evt->speed_state_ptn); + + if (!(lstate & OCRDMA_AE_LSC_LLINK_MASK)) + return; + + if (dev->flags & OCRDMA_FLAGS_LINK_STATUS_INIT) + ocrdma_update_link_state(dev, (lstate & OCRDMA_LINK_ST_MASK)); +} + static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe) { /* async CQE processing */ struct ocrdma_ae_mcqe *cqe = ae_cqe; u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >> OCRDMA_AE_MCQE_EVENT_CODE_SHIFT; - - if (evt_code == OCRDMA_ASYNC_RDMA_EVE_CODE) + switch (evt_code) { + case OCRDMA_ASYNC_LINK_EVE_CODE: + ocrdma_process_link_state(dev, cqe); + break; + case OCRDMA_ASYNC_RDMA_EVE_CODE: ocrdma_dispatch_ibevent(dev, cqe); - else if (evt_code == OCRDMA_ASYNC_GRP5_EVE_CODE) + break; + case OCRDMA_ASYNC_GRP5_EVE_CODE: ocrdma_process_grp5_aync(dev, cqe); - else + break; + default: pr_err("%s(%d) 
invalid evt code=0x%x\n", __func__, dev->id, evt_code); + } } static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe) @@ -1363,7 +1387,8 @@ static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev) return status; } -int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed) +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed, + u8 *lnk_state) { int status = -ENOMEM; struct ocrdma_get_link_speed_rsp *rsp; @@ -1384,8 +1409,11 @@ int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed) goto mbx_err; rsp = (struct ocrdma_get_link_speed_rsp *)cmd; - *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK) - >> OCRDMA_PHY_PS_SHIFT; + if (lnk_speed) + *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK) + >> OCRDMA_PHY_PS_SHIFT; + if (lnk_state) + *lnk_state = (rsp->res_lnk_st & OCRDMA_LINK_ST_MASK); mbx_err: kfree(cmd); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h index 7ed885c1851e..ebc1f442aec3 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -106,7 +106,8 @@ void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed, bool solicited, u16 cqe_popped); /* verbs specific mailbox commands */ -int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed); +int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed, + u8 *lnk_st); int ocrdma_query_config(struct ocrdma_dev *, struct ocrdma_mbx_query_config *config); @@ -153,5 +154,6 @@ char *port_speed_string(struct ocrdma_dev *dev); void ocrdma_init_service_level(struct ocrdma_dev *); void ocrdma_alloc_pd_pool(struct ocrdma_dev *dev); void ocrdma_free_pd_range(struct ocrdma_dev *dev); +void ocrdma_update_link_state(struct ocrdma_dev *dev, u8 lstate); #endif /* __OCRDMA_HW_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index ebe40b414c9d..3afb40b85159 100644 --- 
a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -290,6 +290,7 @@ static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) { int status = 0, i; + u8 lstate = 0; struct ocrdma_dev *dev; dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev)); @@ -319,6 +320,11 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) if (status) goto alloc_err; + /* Query Link state and update */ + status = ocrdma_mbx_get_link_speed(dev, NULL, &lstate); + if (!status) + ocrdma_update_link_state(dev, lstate); + for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) goto sysfs_err; @@ -373,7 +379,7 @@ static void ocrdma_remove(struct ocrdma_dev *dev) ocrdma_remove_free(dev); } -static int ocrdma_open(struct ocrdma_dev *dev) +static int ocrdma_dispatch_port_active(struct ocrdma_dev *dev) { struct ib_event port_event; @@ -384,7 +390,7 @@ static int ocrdma_open(struct ocrdma_dev *dev) return 0; } -static int ocrdma_close(struct ocrdma_dev *dev) +static int ocrdma_dispatch_port_error(struct ocrdma_dev *dev) { struct ib_event err_event; @@ -397,7 +403,7 @@ static int ocrdma_close(struct ocrdma_dev *dev) static void ocrdma_shutdown(struct ocrdma_dev *dev) { - ocrdma_close(dev); + ocrdma_dispatch_port_error(dev); ocrdma_remove(dev); } @@ -408,18 +414,28 @@ static void ocrdma_shutdown(struct ocrdma_dev *dev) static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event) { switch (event) { - case BE_DEV_UP: - ocrdma_open(dev); - break; - case BE_DEV_DOWN: - ocrdma_close(dev); - break; case BE_DEV_SHUTDOWN: ocrdma_shutdown(dev); break; + default: + break; } } +void ocrdma_update_link_state(struct ocrdma_dev *dev, u8 lstate) +{ + if (!(dev->flags & OCRDMA_FLAGS_LINK_STATUS_INIT)) { + dev->flags |= OCRDMA_FLAGS_LINK_STATUS_INIT; + if (!lstate) + return; + } + + if (!lstate) + 
ocrdma_dispatch_port_error(dev); + else + ocrdma_dispatch_port_active(dev); +} + static struct ocrdma_driver ocrdma_drv = { .name = "ocrdma_driver", .add = ocrdma_add, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 6a38268bbe9f..99dd6fdf06d7 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -465,8 +465,11 @@ struct ocrdma_ae_qp_mcqe { u32 valid_ae_event; }; -#define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14 -#define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5 +enum ocrdma_async_event_code { + OCRDMA_ASYNC_LINK_EVE_CODE = 0x01, + OCRDMA_ASYNC_GRP5_EVE_CODE = 0x05, + OCRDMA_ASYNC_RDMA_EVE_CODE = 0x14 +}; enum ocrdma_async_grp5_events { OCRDMA_ASYNC_EVENT_QOS_VALUE = 0x01, @@ -489,6 +492,44 @@ enum OCRDMA_ASYNC_EVENT_TYPE { OCRDMA_MAX_ASYNC_ERRORS }; +struct ocrdma_ae_lnkst_mcqe { + u32 speed_state_ptn; + u32 qos_reason_falut; + u32 evt_tag; + u32 valid_ae_event; +}; + +enum { + OCRDMA_AE_LSC_PORT_NUM_MASK = 0x3F, + OCRDMA_AE_LSC_PT_SHIFT = 0x06, + OCRDMA_AE_LSC_PT_MASK = (0x03 << + OCRDMA_AE_LSC_PT_SHIFT), + OCRDMA_AE_LSC_LS_SHIFT = 0x08, + OCRDMA_AE_LSC_LS_MASK = (0xFF << + OCRDMA_AE_LSC_LS_SHIFT), + OCRDMA_AE_LSC_LD_SHIFT = 0x10, + OCRDMA_AE_LSC_LD_MASK = (0xFF << + OCRDMA_AE_LSC_LD_SHIFT), + OCRDMA_AE_LSC_PPS_SHIFT = 0x18, + OCRDMA_AE_LSC_PPS_MASK = (0xFF << + OCRDMA_AE_LSC_PPS_SHIFT), + OCRDMA_AE_LSC_PPF_MASK = 0xFF, + OCRDMA_AE_LSC_ER_SHIFT = 0x08, + OCRDMA_AE_LSC_ER_MASK = (0xFF << + OCRDMA_AE_LSC_ER_SHIFT), + OCRDMA_AE_LSC_QOS_SHIFT = 0x10, + OCRDMA_AE_LSC_QOS_MASK = (0xFFFF << + OCRDMA_AE_LSC_QOS_SHIFT) +}; + +enum { + OCRDMA_AE_LSC_PLINK_DOWN = 0x00, + OCRDMA_AE_LSC_PLINK_UP = 0x01, + OCRDMA_AE_LSC_LLINK_DOWN = 0x02, + OCRDMA_AE_LSC_LLINK_MASK = 0x02, + OCRDMA_AE_LSC_LLINK_UP = 0x03 +}; + /* mailbox command request and responses */ enum { OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT = 2, @@ -676,7 +717,7 @@ enum { OCRDMA_PHY_PFLT_SHIFT = 0x18, OCRDMA_QOS_LNKSP_MASK = 
0xFFFF0000, OCRDMA_QOS_LNKSP_SHIFT = 0x10, - OCRDMA_LLST_MASK = 0xFF, + OCRDMA_LINK_ST_MASK = 0x01, OCRDMA_PLFC_MASK = 0x00000400, OCRDMA_PLFC_SHIFT = 0x8, OCRDMA_PLRFC_MASK = 0x00000200, @@ -691,7 +732,7 @@ struct ocrdma_get_link_speed_rsp { u32 pflt_pps_ld_pnum; u32 qos_lsp; - u32 res_lls; + u32 res_lnk_st; }; enum { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 583001bcfb8f..76e96f97b3f6 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -171,7 +171,7 @@ static inline void get_link_speed_and_width(struct ocrdma_dev *dev, int status; u8 speed; - status = ocrdma_mbx_get_link_speed(dev, &speed); + status = ocrdma_mbx_get_link_speed(dev, &speed, NULL); if (status) speed = OCRDMA_PHYS_LINK_SPEED_ZERO; From f41647ef06536199d3366530da050b411546979d Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 24 Dec 2015 13:14:08 -0500 Subject: [PATCH 42/67] RDMA/be2net: Remove open and close entry points Recently Doug Ledford reported a deadlock happening between ocrdma-load sequence and NetworkManager service issuing "open" on be2net interface. The deadlock happens when any be2net hook (e.g. open/close) is called in parallel to insmod ocrdma.ko. A. be2net is sending administrative open/close event to ocrdma holding device_list_mutex. It does this from ndo_open/ndo_stop hooks of be2net. So sequence of locks is rtnl_lock---> device_list lock B. When new ocrdma roce device gets registered, infiniband stack now takes rtnl_lock in ib_register_device() in GID initialization routines. So sequence of locks in this path is device_list lock ---> rtnl_lock. This improper locking sequence causes deadlock. In order to resolve the above deadlock condition, ocrdma introduced a patch to stop listening to administrative open/close events generated from be2net driver. It now depends on link-state-change async-event generated from CNA.
This change leaves behind dead code which used to generate administrative open/close events. This patch cleans-up all that dead code from be2net. Reported-by: Doug Ledford CC: Sathya Perla Signed-off-by: Padmanabh Ratnakar Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/net/ethernet/emulex/benet/be.h | 2 -- drivers/net/ethernet/emulex/benet/be_main.c | 4 --- drivers/net/ethernet/emulex/benet/be_roce.c | 36 --------------------- drivers/net/ethernet/emulex/benet/be_roce.h | 4 +-- 4 files changed, 1 insertion(+), 45 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index d463563e1f70..6ee78c203eca 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -848,8 +848,6 @@ void be_roce_dev_remove(struct be_adapter *); /* * internal function to open-close roce device during ifup-ifdown. */ -void be_roce_dev_open(struct be_adapter *); -void be_roce_dev_close(struct be_adapter *); void be_roce_dev_shutdown(struct be_adapter *); #endif /* BE_H */ diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index b6ad02909d6b..ff2ff8946671 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -3432,8 +3432,6 @@ static int be_close(struct net_device *netdev) be_disable_if_filters(adapter); - be_roce_dev_close(adapter); - if (adapter->flags & BE_FLAGS_NAPI_ENABLED) { for_all_evt_queues(adapter, eqo, i) { napi_disable(&eqo->napi); @@ -3601,8 +3599,6 @@ static int be_open(struct net_device *netdev) be_link_status_update(adapter, link_status); netif_tx_start_all_queues(netdev); - be_roce_dev_open(adapter); - #ifdef CONFIG_BE2NET_VXLAN if (skyhawk_chip(adapter)) vxlan_get_rx_port(netdev); diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c index 60368207bf58..4089156a7f5e 100644 --- 
a/drivers/net/ethernet/emulex/benet/be_roce.c +++ b/drivers/net/ethernet/emulex/benet/be_roce.c @@ -116,40 +116,6 @@ void be_roce_dev_remove(struct be_adapter *adapter) } } -static void _be_roce_dev_open(struct be_adapter *adapter) -{ - if (ocrdma_drv && adapter->ocrdma_dev && - ocrdma_drv->state_change_handler) - ocrdma_drv->state_change_handler(adapter->ocrdma_dev, - BE_DEV_UP); -} - -void be_roce_dev_open(struct be_adapter *adapter) -{ - if (be_roce_supported(adapter)) { - mutex_lock(&be_adapter_list_lock); - _be_roce_dev_open(adapter); - mutex_unlock(&be_adapter_list_lock); - } -} - -static void _be_roce_dev_close(struct be_adapter *adapter) -{ - if (ocrdma_drv && adapter->ocrdma_dev && - ocrdma_drv->state_change_handler) - ocrdma_drv->state_change_handler(adapter->ocrdma_dev, - BE_DEV_DOWN); -} - -void be_roce_dev_close(struct be_adapter *adapter) -{ - if (be_roce_supported(adapter)) { - mutex_lock(&be_adapter_list_lock); - _be_roce_dev_close(adapter); - mutex_unlock(&be_adapter_list_lock); - } -} - void be_roce_dev_shutdown(struct be_adapter *adapter) { if (be_roce_supported(adapter)) { @@ -177,8 +143,6 @@ int be_roce_register_driver(struct ocrdma_driver *drv) _be_roce_dev_add(dev); netdev = dev->netdev; - if (netif_running(netdev) && netif_oper_up(netdev)) - _be_roce_dev_open(dev); } mutex_unlock(&be_adapter_list_lock); return 0; diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h index cde6ef905ec4..fde609789483 100644 --- a/drivers/net/ethernet/emulex/benet/be_roce.h +++ b/drivers/net/ethernet/emulex/benet/be_roce.h @@ -60,9 +60,7 @@ struct ocrdma_driver { void (*state_change_handler) (struct ocrdma_dev *, u32 new_state); }; -enum { - BE_DEV_UP = 0, - BE_DEV_DOWN = 1, +enum be_roce_event { BE_DEV_SHUTDOWN = 2 }; From 21491412f2ec6f13d4104de734dec0ba659d092e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Dec 2015 13:01:22 -0700 Subject: [PATCH 43/67] block: add blk_start_queue_async() We currently 
only have an inline/sync helper to restart a stopped queue. If drivers need an async version, they have to roll their own. Add a generic helper instead. Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 17 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index c487b94c59e3..33e2f62d5062 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -206,6 +206,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs) } EXPORT_SYMBOL(blk_delay_queue); +/** + * blk_start_queue_async - asynchronously restart a previously stopped queue + * @q: The &struct request_queue in question + * + * Description: + * blk_start_queue_async() will clear the stop flag on the queue, and + * ensure that the request_fn for the queue is run from an async + * context. + **/ +void blk_start_queue_async(struct request_queue *q) +{ + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + blk_run_queue_async(q); +} +EXPORT_SYMBOL(blk_start_queue_async); + /** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0169ba2e2e64..c70e3588a48c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -797,6 +797,7 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int blk_queue_enter(struct request_queue *q, gfp_t gfp); extern void blk_queue_exit(struct request_queue *q); extern void blk_start_queue(struct request_queue *q); +extern void blk_start_queue_async(struct request_queue *q); extern void blk_stop_queue(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(struct request_queue *q); From 48cc661e7f4cec80b6aa894cc6902c292f201ea8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Dec 2015 13:02:47 -0700 Subject: [PATCH 44/67] null_blk: use async queue restart helper If null_blk is run in NULL_IRQ_TIMER mode 
and with queue_mode NULL_Q_RQ, we need to restart the queue from the hrtimer interrupt. We can't directly invoke the request_fn from that context, so punt the queue run to async kblockd context. Tested-by: Rabin Vincent Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index a428e4ef71fd..09e3c0d87ecc 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -232,20 +232,19 @@ static void end_cmd(struct nullb_cmd *cmd) break; case NULL_Q_BIO: bio_endio(cmd->bio); - goto free_cmd; + break; } + free_cmd(cmd); + /* Restart queue if needed, as we are freeing a tag */ - if (q && !q->mq_ops && blk_queue_stopped(q)) { + if (queue_mode == NULL_Q_RQ && blk_queue_stopped(q)) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - if (blk_queue_stopped(q)) - blk_start_queue(q); + blk_start_queue_async(q); spin_unlock_irqrestore(q->queue_lock, flags); } -free_cmd: - free_cmd(cmd); } static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) From c3293a9ac2a4f9160b85b5e986a8e0c54986e7f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Tue, 29 Dec 2015 14:37:56 +0100 Subject: [PATCH 45/67] lightnvm: wrong offset in bad blk lun calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dev->nr_luns reports the total number of luns available in a device while dev->luns_per_chnl is the number of luns per channel. When multiple channels are available, the offset is calculated from a channel and lun id into a linear array. As it multiplies with the total number of luns, we go out of bound when channel id > 0 and causes the kernel to panic when we read a protected kernel memory area. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/gennvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index f434e89e1c7a..a54b339951a3 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -75,7 +75,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, struct nvm_block *blk; int i; - lun = &gn->luns[(dev->nr_luns * ppa.g.ch) + ppa.g.lun]; + lun = &gn->luns[(dev->luns_per_chnl * ppa.g.ch) + ppa.g.lun]; for (i = 0; i < nr_blocks; i++) { if (blks[i] == 0) From 76cc404bfdc0d419c720de4daaf2584542734f42 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 28 Dec 2015 20:47:08 -0500 Subject: [PATCH 46/67] [PATCH] arm: fix handling of F_OFD_... in oabi_fcntl64() Cc: stable@vger.kernel.org # 3.15+ Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- arch/arm/kernel/sys_oabi-compat.c | 77 ++++++++++++++++--------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index b83f3b7737fb..087acb569b63 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -193,15 +193,44 @@ struct oabi_flock64 { pid_t l_pid; } __attribute__ ((packed,aligned(4))); +static long do_locks(unsigned int fd, unsigned int cmd, + unsigned long arg) +{ + struct flock64 kernel; + struct oabi_flock64 user; + mm_segment_t fs; + long ret; + + if (copy_from_user(&user, (struct oabi_flock64 __user *)arg, + sizeof(user))) + return -EFAULT; + kernel.l_type = user.l_type; + kernel.l_whence = user.l_whence; + kernel.l_start = user.l_start; + kernel.l_len = user.l_len; + kernel.l_pid = user.l_pid; + + fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl64(fd, cmd, (unsigned long)&kernel); + set_fs(fs); + + if (!ret && (cmd == F_GETLK64 || cmd == F_OFD_GETLK)) { + user.l_type = kernel.l_type; + user.l_whence = kernel.l_whence; + user.l_start = 
kernel.l_start; + user.l_len = kernel.l_len; + user.l_pid = kernel.l_pid; + if (copy_to_user((struct oabi_flock64 __user *)arg, + &user, sizeof(user))) + ret = -EFAULT; + } + return ret; +} + asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg) { - struct oabi_flock64 user; - struct flock64 kernel; - mm_segment_t fs = USER_DS; /* initialized to kill a warning */ - unsigned long local_arg = arg; - int ret; - switch (cmd) { case F_OFD_GETLK: case F_OFD_SETLK: @@ -209,39 +238,11 @@ asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd, case F_GETLK64: case F_SETLK64: case F_SETLKW64: - if (copy_from_user(&user, (struct oabi_flock64 __user *)arg, - sizeof(user))) - return -EFAULT; - kernel.l_type = user.l_type; - kernel.l_whence = user.l_whence; - kernel.l_start = user.l_start; - kernel.l_len = user.l_len; - kernel.l_pid = user.l_pid; - local_arg = (unsigned long)&kernel; - fs = get_fs(); - set_fs(KERNEL_DS); + return do_locks(fd, cmd, arg); + + default: + return sys_fcntl64(fd, cmd, arg); } - - ret = sys_fcntl64(fd, cmd, local_arg); - - switch (cmd) { - case F_GETLK64: - if (!ret) { - user.l_type = kernel.l_type; - user.l_whence = kernel.l_whence; - user.l_start = kernel.l_start; - user.l_len = kernel.l_len; - user.l_pid = kernel.l_pid; - if (copy_to_user((struct oabi_flock64 __user *)arg, - &user, sizeof(user))) - ret = -EFAULT; - } - case F_SETLK64: - case F_SETLKW64: - set_fs(fs); - } - - return ret; } struct oabi_epoll_event { From 90c7afc96cbbd77f44094b5b651261968e97de67 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 23 Dec 2015 14:39:27 -0800 Subject: [PATCH 47/67] openvswitch: Fix template leak in error cases. Commit 5b48bb8506c5 ("openvswitch: Fix helper reference leak") fixed a reference leak on helper objects, but inadvertently introduced a leak on the ct template. 
Previously, ct_info.ct->general.use was initialized to 0 by nf_ct_tmpl_alloc() and only incremented when ovs_ct_copy_action() returned successful. If an error occurred while adding the helper or adding the action to the actions buffer, the __ovs_ct_free_action() cleanup would use nf_ct_put() to free the entry; However, this relies on atomic_dec_and_test(ct_info.ct->general.use). This reference must be incremented first, or nf_ct_put() will never free it. Fix the issue by acquiring a reference to the template immediately after allocation. Fixes: cae3a2627520 ("openvswitch: Allow attaching helpers to ct action") Fixes: 5b48bb8506c5 ("openvswitch: Fix helper reference leak") Signed-off-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3e8892216f94..e004067ec24a 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -698,6 +698,10 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } + + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); + if (helper) { err = ovs_ct_add_helper(&ct_info, helper, key, log); if (err) @@ -709,8 +713,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); From c1e3334fa4b2891752f1367b47a60209353ba2f5 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 26 Dec 2015 20:12:13 +0100 Subject: [PATCH 48/67] drivers: net: cpsw: fix error return code Propagate the return value of platform_get_irq on failure. 
A simplified version of the semantic match that finds the two cases where no error code is returned at all is as follows: (http://coccinelle.lip6.fr/) // @@ identifier ret; expression e1,e2; @@ ( if (\(ret < 0\|ret != 0\)) { ... return ret; } | ret = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 3b489caea096..fc958067d10a 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -2427,7 +2427,7 @@ static int cpsw_probe(struct platform_device *pdev) ndev->irq = platform_get_irq(pdev, 1); if (ndev->irq < 0) { dev_err(priv->dev, "error getting irq resource\n"); - ret = -ENOENT; + ret = ndev->irq; goto clean_ale_ret; } @@ -2448,8 +2448,10 @@ static int cpsw_probe(struct platform_device *pdev) /* RX IRQ */ irq = platform_get_irq(pdev, 1); - if (irq < 0) + if (irq < 0) { + ret = irq; goto clean_ale_ret; + } priv->irqs_table[0] = irq; ret = devm_request_irq(&pdev->dev, irq, cpsw_rx_interrupt, @@ -2461,8 +2463,10 @@ static int cpsw_probe(struct platform_device *pdev) /* TX IRQ */ irq = platform_get_irq(pdev, 2); - if (irq < 0) + if (irq < 0) { + ret = irq; goto clean_ale_ret; + } priv->irqs_table[1] = irq; ret = devm_request_irq(&pdev->dev, irq, cpsw_tx_interrupt, From 398c7500a1f5f74e207bd2edca1b1721b3cc1f1e Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 23 Dec 2015 21:04:31 -0800 Subject: [PATCH 49/67] MIPS: VDSO: Fix build error with binutils 2.24 and earlier Commit 2a037f310bab ("MIPS: VDSO: Fix build error") tries to fix a build error seen with binutils 2.24 and earlier. However, the fix does not work, and again results in the already known build errors if the kernel is built with an earlier version of binutils. 
CC arch/mips/vdso/gettimeofday.o /tmp/ccnOVbHT.s: Assembler messages: /tmp/ccnOVbHT.s:50: Error: can't resolve `_start' {*UND* section} - `L0 {.text section} /tmp/ccnOVbHT.s:374: Error: can't resolve `_start' {*UND* section} - `L0 {.text section} scripts/Makefile.build:258: recipe for target 'arch/mips/vdso/gettimeofday.o' failed make[2]: *** [arch/mips/vdso/gettimeofday.o] Error 1 Fixes: 2a037f310bab ("MIPS: VDSO: Fix build error") Cc: Qais Yousef Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/11926/ Signed-off-by: Guenter Roeck Signed-off-by: Ralf Baechle --- arch/mips/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 018f8c7b94f2..14568900fc1d 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -26,7 +26,7 @@ aflags-vdso := $(ccflags-vdso) \ # the comments on that file. # ifndef CONFIG_CPU_MIPSR6 - ifeq ($(call ld-ifversion, -lt, 22500000, y),) + ifeq ($(call ld-ifversion, -lt, 22500000, y),y) $(warning MIPS VDSO requires binutils >= 2.25) obj-vdso-y := $(filter-out gettimeofday.o, $(obj-vdso-y)) ccflags-vdso += -DDISABLE_MIPS_VDSO From 5c9ee4cbf2a945271f25b89b137f2c03bbc3be33 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Tue, 29 Dec 2015 14:54:06 -0800 Subject: [PATCH 50/67] ocfs2: fix BUG when calculate new backup super When resizing, it firstly extends the last gd. Once it should backup super in the gd, it calculates new backup super and update the corresponding value. But it currently doesn't consider the situation that the backup super is already done. And in this case, it still sets the bit in gd bitmap and then decrease from bg_free_bits_count, which leads to a corrupted gd and trigger the BUG in ocfs2_block_group_set_bits: BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); So check whether the backup super is done and then do the updates. 
Signed-off-by: Joseph Qi Reviewed-by: Jiufei Xue Reviewed-by: Yiwen Jiang Cc: Mark Fasheh Cc: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/resize.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index d5da6f624142..79b8021302b3 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -54,11 +54,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode, struct ocfs2_group_desc *gd, u16 cl_cpg, + u16 old_bg_clusters, int set) { int i; u16 backups = 0; - u32 cluster; + u32 cluster, lgd_cluster; u64 blkno, gd_blkno, lgd_blkno = le64_to_cpu(gd->bg_blkno); for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) { @@ -71,6 +72,12 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode, else if (gd_blkno > lgd_blkno) break; + /* check if already done backup super */ + lgd_cluster = ocfs2_blocks_to_clusters(inode->i_sb, lgd_blkno); + lgd_cluster += old_bg_clusters; + if (lgd_cluster >= cluster) + continue; + if (set) ocfs2_set_bit(cluster % cl_cpg, (unsigned long *)gd->bg_bitmap); @@ -99,6 +106,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, u16 chain, num_bits, backups = 0; u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); + u16 old_bg_clusters; trace_ocfs2_update_last_group_and_inode(new_clusters, first_new_cluster); @@ -112,6 +120,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, group = (struct ocfs2_group_desc *)group_bh->b_data; + old_bg_clusters = le16_to_cpu(group->bg_bits) / cl_bpc; /* update the group first. 
*/ num_bits = new_clusters * cl_bpc; le16_add_cpu(&group->bg_bits, num_bits); @@ -125,7 +134,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, OCFS2_FEATURE_COMPAT_BACKUP_SB)) { backups = ocfs2_calc_new_backup_super(bm_inode, group, - cl_cpg, 1); + cl_cpg, old_bg_clusters, 1); le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } @@ -163,7 +172,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, group, - cl_cpg, 0); + cl_cpg, old_bg_clusters, 0); le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); From 6df38689e0e9a07ff4f42c06b302e203b33667e9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 29 Dec 2015 14:54:10 -0800 Subject: [PATCH 51/67] mm: memcontrol: fix possible memcg leak due to interrupted reclaim Memory cgroup reclaim can be interrupted with mem_cgroup_iter_break() once enough pages have been reclaimed, in which case, in contrast to a full round-trip over a cgroup sub-tree, the current position stored in mem_cgroup_reclaim_iter of the target cgroup does not get invalidated and so is left holding the reference to the last scanned cgroup. If the target cgroup does not get scanned again (we might have just reclaimed the last page or all processes might exit and free their memory voluntarily), we will leak it, because there is nobody to put the reference held by the iterator. The problem is easy to reproduce by running the following command sequence in a loop: mkdir /sys/fs/cgroup/memory/test echo 100M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes echo $$ > /sys/fs/cgroup/memory/test/cgroup.procs memhog 150M echo $$ > /sys/fs/cgroup/memory/cgroup.procs rmdir test The cgroups generated by it will never get freed. This patch fixes this issue by making mem_cgroup_iter avoid taking reference to the current position.
In order not to hit use-after-free bug while running reclaim in parallel with cgroup deletion, we make use of ->css_released cgroup callback to clear references to the dying cgroup in all reclaim iterators that might refer to it. This callback is called right before scheduling rcu work which will free css, so if we access iter->position from rcu read section, we might be sure it won't go away under us. [hannes@cmpxchg.org: clean up css ref handling] Fixes: 5ac8fb31ad2e ("mm: memcontrol: convert reclaim iterator to simple css refcounting") Signed-off-by: Vladimir Davydov Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: [3.19+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 60 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e234c21a5e6c..fc10620967c7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -903,14 +903,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && reclaim->generation != iter->generation) goto out_unlock; - do { + while (1) { pos = READ_ONCE(iter->position); + if (!pos || css_tryget(&pos->css)) + break; /* - * A racing update may change the position and - * put the last reference, hence css_tryget(), - * or retry to see the updated position. + * css reference reached zero, so iter->position will + * be cleared by ->css_released. However, we should not + * rely on this happening soon, because ->css_released + * is called from a work queue, and by busy-waiting we + * might block it. So we clear iter->position right + * away. 
*/ - } while (pos && !css_tryget(&pos->css)); + (void)cmpxchg(&iter->position, pos, NULL); + } } if (pos) @@ -956,17 +962,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, } if (reclaim) { - if (cmpxchg(&iter->position, pos, memcg) == pos) { - if (memcg) - css_get(&memcg->css); - if (pos) - css_put(&pos->css); - } - /* - * pairs with css_tryget when dereferencing iter->position - * above. + * The position could have already been updated by a competing + * thread, so check that the value hasn't changed since we read + * it to avoid reclaiming from the same cgroup twice. */ + (void)cmpxchg(&iter->position, pos, memcg); + if (pos) css_put(&pos->css); @@ -999,6 +1001,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup_reclaim_iter *iter; + struct mem_cgroup_per_zone *mz; + int nid, zid; + int i; + + while ((memcg = parent_mem_cgroup(memcg))) { + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); + } + } + } + } +} + /* * Iteration constructs for visiting all cgroups (under a tree). 
If * loops are exited prematurely (break), mem_cgroup_iter_break() must @@ -4324,6 +4348,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) wb_memcg_offline(memcg); } +static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + invalidate_reclaim_iterators(memcg); +} + static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); @@ -5185,6 +5216,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, + .css_released = mem_cgroup_css_released, .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, From facca61683f937f31f90307cc64851436c8a3e21 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Dec 2015 14:54:13 -0800 Subject: [PATCH 52/67] arch/x86/xen/suspend.c: include xen/xen.h Fix the build warning: arch/x86/xen/suspend.c: In function 'xen_arch_pre_suspend': arch/x86/xen/suspend.c:70:9: error: implicit declaration of function 'xen_pv_domain' [-Werror=implicit-function-declaration] if (xen_pv_domain()) ^ Reported-by: kbuild test robot Cc: Sasha Levin Cc: Konrad Rzeszutek Wilk Cc: Boris Ostrovsky Cc: David Vrabel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/xen/suspend.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 3705eabd7e22..df0c40559583 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,6 +1,7 @@ #include #include +#include #include #include #include From 6122192eb6f2a3311bbf4600c5537fbe1c223022 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 29 Dec 2015 14:54:16 -0800 Subject: [PATCH 53/67] m32r: fix build failure m32r allmodconfig is failing with: In file included from ../include/linux/kvm_para.h:4:0, from ../kernel/watchdog.c:26: 
../include/uapi/linux/kvm_para.h:30:26: fatal error: asm/kvm_para.h: No such file or directory kvm_para.h was not included in the build. Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/include/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index fd104bd221ce..860e440611c9 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += clkdev.h generic-y += cputime.h generic-y += exec.h generic-y += irq_work.h +generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += module.h From 92a8ed4c7643809123ef0a65424569eaacc5c6b0 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 29 Dec 2015 14:54:19 -0800 Subject: [PATCH 54/67] m32r: add io*_rep helpers m32r allmodconfig was failing with the error: error: implicit declaration of function 'read' On checking io.h it turned out that 'read' is not defined but 'readb' is defined and 'ioread8' will then obviously mean 'readb'. At the same time some of the helper functions ioreadN_rep() and iowriteN_rep() were missing which also led to the build failure. 
Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/include/asm/io.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/m32r/include/asm/io.h b/arch/m32r/include/asm/io.h index 61b8931bc192..4b0f5e001d4d 100644 --- a/arch/m32r/include/asm/io.h +++ b/arch/m32r/include/asm/io.h @@ -168,13 +168,21 @@ static inline void _writel(unsigned long l, unsigned long addr) #define writew_relaxed writew #define writel_relaxed writel -#define ioread8 read +#define ioread8 readb #define ioread16 readw #define ioread32 readl #define iowrite8 writeb #define iowrite16 writew #define iowrite32 writel +#define ioread8_rep(p, dst, count) insb((unsigned long)(p), (dst), (count)) +#define ioread16_rep(p, dst, count) insw((unsigned long)(p), (dst), (count)) +#define ioread32_rep(p, dst, count) insl((unsigned long)(p), (dst), (count)) + +#define iowrite8_rep(p, src, count) outsb((unsigned long)(p), (src), (count)) +#define iowrite16_rep(p, src, count) outsw((unsigned long)(p), (src), (count)) +#define iowrite32_rep(p, src, count) outsl((unsigned long)(p), (src), (count)) + #define ioread16be(addr) be16_to_cpu(readw(addr)) #define ioread32be(addr) be32_to_cpu(readl(addr)) #define iowrite16be(v, addr) writew(cpu_to_be16(v), (addr)) From b5a8bc338e68d5f6f753e14ae59b30e75a5ffdde Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 29 Dec 2015 14:54:22 -0800 Subject: [PATCH 55/67] ocfs2: fix flock panic issue Commit 4f6563677ae8 ("Move locks API users to locks_lock_inode_wait()") moved the flock/posix lock identification code to locks_lock_inode_wait(), but failed to set fl_flags to FL_FLOCK, which caused the following kernel panic on 4.4.0_rc5. kernel BUG at fs/locks.c:1895!
invalid opcode: 0000 [#1] SMP Modules linked in: ocfs2(O) ocfs2_dlmfs(O) ocfs2_stack_o2cb(O) ocfs2_dlm(O) ocfs2_nodemanager(O) ocfs2_stackglue(O) iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront xen_netfront xen_fbfront xen_blkfront CPU: 0 PID: 20268 Comm: flock_unit_test Tainted: G O 4.4.0-rc5-next-20151217 #1 Hardware name: Xen HVM domU, BIOS 4.3.1OVM 05/14/2014 task: ffff88007b3672c0 ti: ffff880028b58000 task.ti: ffff880028b58000 RIP: locks_lock_inode_wait+0x2e/0x160 Call Trace: ocfs2_do_flock+0x91/0x160 [ocfs2] ocfs2_flock+0x76/0xd0 [ocfs2] SyS_flock+0x10f/0x1a0 entry_SYSCALL_64_fastpath+0x12/0x71 Code: e5 41 57 41 56 49 89 fe 41 55 41 54 53 48 89 f3 48 81 ec 88 00 00 00 8b 46 40 83 e0 03 83 f8 01 0f 84 ad 00 00 00 83 f8 02 74 04 <0f> 0b eb fe 4c 8d ad 60 ff ff ff 4c 8d 7b 58 e8 0e 8e 73 00 4d RIP locks_lock_inode_wait+0x2e/0x160 RSP ---[ end trace dfca74ec9b5b274c ]--- Fixes: 4f6563677ae8 ("Move locks API users to locks_lock_inode_wait()") Signed-off-by: Junxiao Bi Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/locks.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 652ece4a9d9e..d56f0079b858 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -67,7 +67,10 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, */ locks_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); + &(struct file_lock) { + .fl_type = F_UNLCK, + .fl_flags = FL_FLOCK + }); ocfs2_file_unlock(file); } From 5f0f2887f4de9508dcf438deab28f1de8070c271 Mon Sep 17 00:00:00 2001 From: Andrew Banman Date: Tue, 29 Dec 2015 14:54:25 -0800 Subject: [PATCH 56/67] mm/memory_hotplug.c: check for missing sections in test_pages_in_a_zone() test_pages_in_a_zone() does not account for the possibility of missing sections in the given pfn range. 
pfn_valid_within always returns 1 when CONFIG_HOLES_IN_ZONE is not set, allowing invalid pfns from missing sections to pass the test, leading to a kernel oops. Wrap an additional pfn loop with PAGES_PER_SECTION granularity to check for missing sections before proceeding into the zone-check code. This also prevents a crash from offlining memory devices with missing sections. Despite this, it may be a good idea to keep the related patch '[PATCH 3/3] drivers: memory: prohibit offlining of memory blocks with missing sections' because missing sections in a memory block may lead to other problems not covered by the scope of this fix. Signed-off-by: Andrew Banman Acked-by: Alex Thorlton Cc: Russ Anderson Cc: Alex Thorlton Cc: Yinghai Lu Cc: Greg KH Cc: Seth Jennings Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 67d488ab495e..a042a9d537bb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1375,23 +1375,30 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) */ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn; + unsigned long pfn, sec_end_pfn; struct zone *zone = NULL; struct page *page; int i; - for (pfn = start_pfn; + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn); pfn < end_pfn; - pfn += MAX_ORDER_NR_PAGES) { - i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) - i++; - if (i == MAX_ORDER_NR_PAGES) + pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) { + /* Make sure the memory section is present first */ + if (!present_section_nr(pfn_to_section_nr(pfn))) continue; - page = pfn_to_page(pfn + i); - if (zone && page_zone(page) != zone) - return 0; - zone = page_zone(page); + for (; pfn < sec_end_pfn && pfn < 
end_pfn; + pfn += MAX_ORDER_NR_PAGES) { + i = 0; + /* This is just a CONFIG_HOLES_IN_ZONE check.*/ + while ((i < MAX_ORDER_NR_PAGES) && + !pfn_valid_within(pfn + i)) + i++; + if (i == MAX_ORDER_NR_PAGES) + continue; + page = pfn_to_page(pfn + i); + if (zone && page_zone(page) != zone) + return 0; + zone = page_zone(page); + } } return 1; } From cc28d6d80f6ab494b10f0e2ec949eacd610f66e3 Mon Sep 17 00:00:00 2001 From: xuejiufei Date: Tue, 29 Dec 2015 14:54:29 -0800 Subject: [PATCH 57/67] ocfs2/dlm: clear migration_pending when migration target goes down We have found a BUG on res->migration_pending when migrating lock resources. The situation is as follows. dlm_mark_lockres_migrating res->migration_pending = 1; __dlm_lockres_reserve_ast dlm_lockres_release_ast returns with res->migration_pending remains because other threads reserve asts wait dlm_migration_can_proceed returns 1 >>>>>>> o2hb found that target goes down and remove target from domain_map dlm_migration_can_proceed returns 1 dlm_mark_lockres_migrating returns -EHOSTDOWN with res->migration_pending still remains. When reentering dlm_mark_lockres_migrating(), it will trigger the BUG_ON with res->migration_pending. So clear migration_pending when target is down.
Signed-off-by: Jiufei Xue Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ce38b4ccc9ab..84f2f8079466 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2843,6 +2843,8 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; if (!ret) BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); + else + res->migration_pending = 0; spin_unlock(&res->spinlock); /* From 6cdb18ad98a49f7e9b95d538a0614cde827404b8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 29 Dec 2015 14:54:32 -0800 Subject: [PATCH 58/67] mm/vmstat: fix overflow in mod_zone_page_state() mod_zone_page_state() takes a "delta" integer argument. delta contains the number of pages that should be added or subtracted from a struct zone's vm_stat field. If a zone is larger than 8TB this will cause overflows. E.g. for a zone with a size slightly larger than 8TB the line mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); in mm/page_alloc.c:free_area_init_core() will result in a negative result for the NR_ALLOC_BATCH entry within the zone's vm_stat, since 8TB contain 0x8xxxxxxx pages which will be sign extended to a negative value. Fix this by changing the delta argument to long type. This could fix an early boot problem seen on s390, where we have a 9TB system with only one node. ZONE_DMA contains 2GB and ZONE_NORMAL the rest. The system is trying to allocate a GFP_DMA page but ZONE_DMA is completely empty, so it tries to reclaim pages in an endless loop. This was seen on a heavily patched 3.10 kernel. One possible explanation seems to be the overflows caused by mod_zone_page_state().
Unfortunately I did not have the chance to verify that this patch actually fixes the problem, since I don't have access to the system right now. However the overflow problem does exist anyway. Given the description that a system with slightly less than 8TB does work, this seems to be a candidate for the observed problem. Signed-off-by: Heiko Carstens Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 6 +++--- mm/vmstat.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 5dbc8b0ee567..3e5d9075960f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -176,11 +176,11 @@ extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) #ifdef CONFIG_SMP -void __mod_zone_page_state(struct zone *, enum zone_stat_item item, int); +void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __inc_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item); -void mod_zone_page_state(struct zone *, enum zone_stat_item, int); +void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void inc_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item); @@ -205,7 +205,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, * The functions directly modify the zone and global counters. 
*/ static inline void __mod_zone_page_state(struct zone *zone, - enum zone_stat_item item, int delta) + enum zone_stat_item item, long delta) { zone_page_state_add(delta, zone, item); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 0d5712b0206c..4ebc17d948cb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -219,7 +219,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, * particular counter cannot be updated from interrupt context. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -318,8 +318,8 @@ EXPORT_SYMBOL(__dec_zone_page_state); * 1 Overstepping half of threshold * -1 Overstepping minus half of threshold */ -static inline void mod_state(struct zone *zone, - enum zone_stat_item item, int delta, int overstep_mode) +static inline void mod_state(struct zone *zone, enum zone_stat_item item, + long delta, int overstep_mode) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -357,7 +357,7 @@ static inline void mod_state(struct zone *zone, } void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { mod_state(zone, item, delta, 0); } @@ -384,7 +384,7 @@ EXPORT_SYMBOL(dec_zone_page_state); * Use interrupt disable to serialize counter updates */ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { unsigned long flags; From 3d8acd1f667b45c531401c8f0c2033072e32a05d Mon Sep 17 00:00:00 2001 From: Gary Wang Date: Wed, 23 Dec 2015 16:11:35 +0800 Subject: [PATCH 59/67] drm/i915: increase the tries for HDMI hotplug live status checking The total delay of HDMI hotplug detecting with 30ms is sometimes not enough for HDMI live status up with specific HDMI monitors in BSW platform. After doing experiments for following monitors, it needs 80ms at least for those worst cases.
Lenovo L246 1xwA (4 failed, necessary hot-plug delay: 58/40/60/40ms) Philips HH2AP (9 failed, necessary hot-plug delay: 80/50/50/60/46/40/58/58/39ms) BENQ ET-0035-N (6 failed, necessary hot-plug delay: 60/50/50/80/80/40ms) DELL U2713HM (2 failed, necessary hot-plug delay: 58/59ms) HP HP-LP2475w (5 failed, necessary hot-plug delay: 70/50/40/60/40ms) It looks like 70-80 ms is BSW platform needs in some bad cases of the monitors at this end (8 times delay at most). Keep less than 100ms for HDCP pulse HPD low (with at least 100ms) to respond a plug out. Reviewed-by: Cooper Chiou Tested-by: Gary Wang Cc: Gavin Hindman Cc: Sonika Jindal Cc: Shashank Sharma Cc: Shobhit Kumar Signed-off-by: Gary Wang Link: http://patchwork.freedesktop.org/patch/msgid/1450858295-12804-1-git-send-email-gary.c.wang@intel.com Tested-by: Shobhit Kumar Cc: drm-intel-fixes@lists.freedesktop.org Fixes: 237ed86c693d ("drm/i915: Check live status before reading edid") Signed-off-by: Daniel Vetter (cherry picked from commit f8d03ea0053b23de42c828d559016eabe0b91523) [Jani: undo the file mode change of the original commit] Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_hdmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c index 64086f2d4e26..e6c035b0fc1c 100644 --- a/drivers/gpu/drm/i915/intel_hdmi.c +++ b/drivers/gpu/drm/i915/intel_hdmi.c @@ -1381,7 +1381,7 @@ intel_hdmi_detect(struct drm_connector *connector, bool force) intel_display_power_get(dev_priv, POWER_DOMAIN_GMBUS); - for (try = 0; !live_status && try < 4; try++) { + for (try = 0; !live_status && try < 9; try++) { if (try) msleep(10); live_status = intel_digital_port_connected(dev_priv, From 574aab1e02837927e3c94193eedf94128ad10b6d Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Tue, 29 Dec 2015 13:29:55 +0100 Subject: [PATCH 60/67] net, socket, socket_wq: fix missing initialization of flags Commit ceb5d58b2170 ("net: fix sock_wake_async() 
rcu protection") from the current 4.4 release cycle introduced a new flags member in struct socket_wq and moved SOCKWQ_ASYNC_NOSPACE and SOCKWQ_ASYNC_WAITDATA from struct socket's flags member into that new place. Unfortunately, the new flags field is never initialized properly, at least not for the struct socket_wq instance created in sock_alloc_inode(). One particular issue I encountered because of this is that my GNU Emacs failed to draw anything on my desktop -- i.e. what I got is a transparent window, including the title bar. Bisection led to the commit mentioned above and further investigation by means of strace told me that Emacs is indeed speaking to my Xorg through an O_ASYNC AF_UNIX socket. This is reproducible 100% of the time and the fact that properly initializing the struct socket_wq ->flags fixes the issue leads me to the conclusion that somehow SOCKWQ_ASYNC_WAITDATA got set in the uninitialized ->flags, preventing my Emacs from receiving any SIGIO's due to data becoming available and it got stuck. Make sock_alloc_inode() set the newly created struct socket_wq's ->flags member to zero. Fixes: ceb5d58b2170 ("net: fix sock_wake_async() rcu protection") Signed-off-by: Nicolai Stange Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/socket.c b/net/socket.c index 29822d6dd91e..d730ef9dfbf0 100644 --- a/net/socket.c +++ b/net/socket.c @@ -257,6 +257,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb) } init_waitqueue_head(&wq->wait); wq->fasync_list = NULL; + wq->flags = 0; RCU_INIT_POINTER(ei->socket.wq, wq); ei->socket.state = SS_UNCONNECTED; From 068d8bd338e855286aea54e70d1c101569284b21 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 29 Dec 2015 17:49:25 +0800 Subject: [PATCH 61/67] sctp: sctp should release assoc when sctp_make_abort_user return NULL in sctp_close In sctp_close, sctp_make_abort_user may return NULL because of memory allocation failure.
If this happens, it will bypass any state change and never free the assoc. The assoc has no chance to be freed and it will be kept in memory with the state it had even after the socket is closed by sctp_close(). So if sctp_make_abort_user fails to allocate memory, we should abort the asoc via sctp_primitive_ABORT as well. Just like the annotation in sctp_sf_cookie_wait_prm_abort and sctp_sf_do_9_1_prm_abort said, "Even if we can't send the ABORT due to low memory delete the TCB. This is a departure from our typical NOMEM handling". But then the chunk is NULL (low memory) and the SCTP_CMD_REPLY cmd would dereference the chunk pointer, and system crash. So we should add SCTP_CMD_REPLY cmd only when the chunk is not NULL, just like other places where it adds SCTP_CMD_REPLY cmd. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 6 ++++-- net/sctp/socket.c | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index cd34a4a34065..22c2bf367d7e 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4829,7 +4829,8 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort( retval = SCTP_DISPOSITION_CONSUME; - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + if (abort) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); /* Even if we can't send the ABORT due to low memory delete the * TCB. This is a departure from our typical NOMEM handling. 
@@ -4966,7 +4967,8 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort( SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); retval = SCTP_DISPOSITION_CONSUME; - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + if (abort) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 529ed357a2cf..ef1d90fdc773 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1516,8 +1516,7 @@ static void sctp_close(struct sock *sk, long timeout) struct sctp_chunk *chunk; chunk = sctp_make_abort_user(asoc, NULL, 0); - if (chunk) - sctp_primitive_ABORT(net, asoc, chunk); + sctp_primitive_ABORT(net, asoc, chunk); } else sctp_primitive_SHUTDOWN(net, asoc, NULL); } From 8b30ca73b7cc7f2177cfc4e8274d2ebdba328cd5 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 31 Dec 2015 15:18:02 -0500 Subject: [PATCH 62/67] sparc: Add all necessary direct socket system calls. The GLIBC folks would like to eliminate socketcall support eventually, and this makes sense regardless so wire them all up. Signed-off-by: David S. Miller --- arch/sparc/include/uapi/asm/unistd.h | 5 ++++- arch/sparc/kernel/systbls_32.S | 19 ++++++++++--------- arch/sparc/kernel/systbls_64.S | 18 ++++++++++-------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h index f31a124a8497..5655912fbca7 100644 --- a/arch/sparc/include/uapi/asm/unistd.h +++ b/arch/sparc/include/uapi/asm/unistd.h @@ -418,8 +418,11 @@ #define __NR_execveat 350 #define __NR_membarrier 351 #define __NR_userfaultfd 352 +#define __NR_bind 353 +#define __NR_listen 354 +#define __NR_setsockopt 355 -#define NR_syscalls 353 +#define NR_syscalls 356 /* Bitmask values returned from kern_features system call. 
*/ #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 78e80293cb6d..d557a256517c 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -35,18 +35,18 @@ sys_call_table: /*80*/ .long sys_setgroups16, sys_getpgrp, sys_setgroups, sys_setitimer, sys_ftruncate64 /*85*/ .long sys_swapon, sys_getitimer, sys_setuid, sys_sethostname, sys_setgid /*90*/ .long sys_dup2, sys_setfsuid, sys_fcntl, sys_select, sys_setfsgid -/*95*/ .long sys_fsync, sys_setpriority, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +/*95*/ .long sys_fsync, sys_setpriority, sys_socket, sys_connect, sys_accept /*100*/ .long sys_getpriority, sys_rt_sigreturn, sys_rt_sigaction, sys_rt_sigprocmask, sys_rt_sigpending /*105*/ .long sys_rt_sigtimedwait, sys_rt_sigqueueinfo, sys_rt_sigsuspend, sys_setresuid, sys_getresuid -/*110*/ .long sys_setresgid, sys_getresgid, sys_setregid, sys_nis_syscall, sys_nis_syscall -/*115*/ .long sys_getgroups, sys_gettimeofday, sys_getrusage, sys_nis_syscall, sys_getcwd +/*110*/ .long sys_setresgid, sys_getresgid, sys_setregid, sys_recvmsg, sys_sendmsg +/*115*/ .long sys_getgroups, sys_gettimeofday, sys_getrusage, sys_getsockopt, sys_getcwd /*120*/ .long sys_readv, sys_writev, sys_settimeofday, sys_fchown16, sys_fchmod -/*125*/ .long sys_nis_syscall, sys_setreuid16, sys_setregid16, sys_rename, sys_truncate -/*130*/ .long sys_ftruncate, sys_flock, sys_lstat64, sys_nis_syscall, sys_nis_syscall -/*135*/ .long sys_nis_syscall, sys_mkdir, sys_rmdir, sys_utimes, sys_stat64 -/*140*/ .long sys_sendfile64, sys_nis_syscall, sys_futex, sys_gettid, sys_getrlimit +/*125*/ .long sys_recvfrom, sys_setreuid16, sys_setregid16, sys_rename, sys_truncate +/*130*/ .long sys_ftruncate, sys_flock, sys_lstat64, sys_sendto, sys_shutdown +/*135*/ .long sys_socketpair, sys_mkdir, sys_rmdir, sys_utimes, sys_stat64 +/*140*/ .long sys_sendfile64, sys_getpeername, sys_futex, sys_gettid, 
sys_getrlimit /*145*/ .long sys_setrlimit, sys_pivot_root, sys_prctl, sys_pciconfig_read, sys_pciconfig_write -/*150*/ .long sys_nis_syscall, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64 +/*150*/ .long sys_getsockname, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64 /*155*/ .long sys_fcntl64, sys_inotify_rm_watch, sys_statfs, sys_fstatfs, sys_oldumount /*160*/ .long sys_sched_setaffinity, sys_sched_getaffinity, sys_getdomainname, sys_setdomainname, sys_nis_syscall /*165*/ .long sys_quotactl, sys_set_tid_address, sys_mount, sys_ustat, sys_setxattr @@ -87,4 +87,5 @@ sys_call_table: /*335*/ .long sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr /*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf -/*350*/ .long sys_execveat, sys_membarrier, sys_userfaultfd +/*350*/ .long sys_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen +/*355*/ .long sys_setsockopt diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 2549c2c3ec2f..898684c97031 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -37,15 +37,15 @@ sys_call_table32: /*80*/ .word sys_setgroups16, sys_getpgrp, sys_setgroups, compat_sys_setitimer, sys32_ftruncate64 .word sys_swapon, compat_sys_getitimer, sys_setuid, sys_sethostname, sys_setgid /*90*/ .word sys_dup2, sys_setfsuid, compat_sys_fcntl, sys32_select, sys_setfsgid - .word sys_fsync, sys_setpriority, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + .word sys_fsync, sys_setpriority, sys_socket, sys_connect, sys_accept /*100*/ .word sys_getpriority, sys32_rt_sigreturn, compat_sys_rt_sigaction, compat_sys_rt_sigprocmask, compat_sys_rt_sigpending .word compat_sys_rt_sigtimedwait, compat_sys_rt_sigqueueinfo, compat_sys_rt_sigsuspend, sys_setresuid, sys_getresuid -/*110*/ .word sys_setresgid, 
sys_getresgid, sys_setregid, sys_nis_syscall, sys_nis_syscall - .word sys_getgroups, compat_sys_gettimeofday, compat_sys_getrusage, sys_nis_syscall, sys_getcwd +/*110*/ .word sys_setresgid, sys_getresgid, sys_setregid, compat_sys_recvmsg, compat_sys_sendmsg + .word sys_getgroups, compat_sys_gettimeofday, compat_sys_getrusage, compat_sys_getsockopt, sys_getcwd /*120*/ .word compat_sys_readv, compat_sys_writev, compat_sys_settimeofday, sys_fchown16, sys_fchmod - .word sys_nis_syscall, sys_setreuid16, sys_setregid16, sys_rename, compat_sys_truncate -/*130*/ .word compat_sys_ftruncate, sys_flock, compat_sys_lstat64, sys_nis_syscall, sys_nis_syscall - .word sys_nis_syscall, sys_mkdir, sys_rmdir, compat_sys_utimes, compat_sys_stat64 + .word sys_recvfrom, sys_setreuid16, sys_setregid16, sys_rename, compat_sys_truncate +/*130*/ .word compat_sys_ftruncate, sys_flock, compat_sys_lstat64, sys_sendto, sys_shutdown + .word sys_socketpair, sys_mkdir, sys_rmdir, compat_sys_utimes, compat_sys_stat64 /*140*/ .word sys_sendfile64, sys_nis_syscall, sys32_futex, sys_gettid, compat_sys_getrlimit .word compat_sys_setrlimit, sys_pivot_root, sys_prctl, sys_pciconfig_read, sys_pciconfig_write /*150*/ .word sys_nis_syscall, sys_inotify_init, sys_inotify_add_watch, sys_poll, sys_getdents64 @@ -88,7 +88,8 @@ sys_call_table32: .word sys_syncfs, compat_sys_sendmmsg, sys_setns, compat_sys_process_vm_readv, compat_sys_process_vm_writev /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf -/*350*/ .word sys32_execveat, sys_membarrier, sys_userfaultfd +/*350*/ .word sys32_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen + .word compat_sys_setsockopt #endif /* CONFIG_COMPAT */ @@ -168,4 +169,5 @@ sys_call_table: .word sys_syncfs, sys_sendmmsg, sys_setns, sys_process_vm_readv, sys_process_vm_writev /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, 
sys_sched_setattr, sys_sched_getattr .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf -/*350*/ .word sys64_execveat, sys_membarrier, sys_userfaultfd +/*350*/ .word sys64_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen + .word sys_setsockopt From 42d85c52f88dd0d2159f531eb33cc66d6e3e60c0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 31 Dec 2015 15:38:56 -0500 Subject: [PATCH 63/67] sparc: Wire up mlock2 system call. Signed-off-by: David S. Miller --- arch/sparc/include/uapi/asm/unistd.h | 3 ++- arch/sparc/kernel/systbls_32.S | 2 +- arch/sparc/kernel/systbls_64.S | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h index 5655912fbca7..1c26d440d288 100644 --- a/arch/sparc/include/uapi/asm/unistd.h +++ b/arch/sparc/include/uapi/asm/unistd.h @@ -421,8 +421,9 @@ #define __NR_bind 353 #define __NR_listen 354 #define __NR_setsockopt 355 +#define __NR_mlock2 356 -#define NR_syscalls 356 +#define NR_syscalls 357 /* Bitmask values returned from kern_features system call. 
*/ #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index d557a256517c..e663b6c78de2 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -88,4 +88,4 @@ sys_call_table: /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr /*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf /*350*/ .long sys_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen -/*355*/ .long sys_setsockopt +/*355*/ .long sys_setsockopt, sys_mlock2 diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 898684c97031..1557121f4cdc 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -89,7 +89,7 @@ sys_call_table32: /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf /*350*/ .word sys32_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen - .word compat_sys_setsockopt + .word compat_sys_setsockopt, sys_mlock2 #endif /* CONFIG_COMPAT */ @@ -170,4 +170,4 @@ sys_call_table: /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf /*350*/ .word sys64_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen - .word sys_setsockopt + .word sys_setsockopt, sys_mlock2 From 168309855a7d1e16db751e9c647119fe2d2dc878 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Jan 2016 15:15:37 -0800 Subject: [PATCH 64/67] Linux 4.4-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1122433a5cd5..9d94adeceab6 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 4 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Blurry Fish 
Butt # *DOCUMENTATION* From b43417216e9ce55e1f1ab7c834c7ab43db0b53e1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 5 Jan 2016 18:27:29 +0100 Subject: [PATCH 65/67] compat_ioctl: don't look up the fd twice In code in fs/compat_ioctl.c that translates ioctl arguments into an in-kernel structure, then performs sys_ioctl, possibly under set_fs(KERNEL_DS), this commit changes the sys_ioctl calls to do_ioctl calls. do_ioctl is a new function that does the same thing as sys_ioctl, but doesn't look up the fd again. This change is made to avoid (potential) security issues because of ioctl handlers that accept one of the ioctl commands I2C_FUNCS, VIDEO_GET_EVENT, MTIOCPOS, MTIOCGET, TIOCGSERIAL, TIOCSSERIAL, RTC_IRQP_READ, RTC_EPOCH_READ. This can happen for multiple reasons: - The ioctl command number could be reused. - The ioctl handler might not check the full ioctl command. This is e.g. true for drm_ioctl. - The ioctl handler is very special, e.g. cuse_file_ioctl The real issue is that set_fs(KERNEL_DS) is used here, but that's fixed in a separate commit "compat_ioctl: don't call do_ioctl under set_fs(KERNEL_DS)". This change mitigates potential security issues by preventing a race that permits invocation of unlocked_ioctl handlers under KERNEL_DS through compat code even if a corresponding compat_ioctl handler exists. So far, no way has been identified to use this to damage kernel memory without having CAP_SYS_ADMIN in the init ns (with the capability, doing reads/writes at arbitrary kernel addresses should be easy through CUSE's ioctl handler with FUSE_IOCTL_UNRESTRICTED set).
[AV: two missed sys_ioctl() taken care of] Signed-off-by: Jann Horn Signed-off-by: Al Viro --- fs/compat_ioctl.c | 122 ++++++++++++++++++++++++++-------------------- 1 file changed, 68 insertions(+), 54 deletions(-) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index dcf26537c935..06e60cab0c3b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -115,15 +115,27 @@ #include #endif -static int w_long(unsigned int fd, unsigned int cmd, - compat_ulong_t __user *argp) +static int do_ioctl(struct file *file, unsigned int fd, + unsigned int cmd, unsigned long arg) +{ + int err; + + err = security_file_ioctl(file, cmd, arg); + if (err) + return err; + + return do_vfs_ioctl(file, fd, cmd, arg); +} + +static int w_long(struct file *file, unsigned int fd, + unsigned int cmd, compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); int err; unsigned long val; set_fs (KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&val); + err = do_ioctl(file, fd, cmd, (unsigned long)&val); set_fs (old_fs); if (!err && put_user(val, argp)) return -EFAULT; @@ -139,15 +151,15 @@ struct compat_video_event { } u; }; -static int do_video_get_event(unsigned int fd, unsigned int cmd, - struct compat_video_event __user *up) +static int do_video_get_event(struct file *file, unsigned int fd, + unsigned int cmd, struct compat_video_event __user *up) { struct video_event kevent; mm_segment_t old_fs = get_fs(); int err; set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long) &kevent); + err = do_ioctl(file, fd, cmd, (unsigned long) &kevent); set_fs(old_fs); if (!err) { @@ -169,8 +181,8 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(unsigned int fd, unsigned int cmd, - struct compat_video_still_picture __user *up) +static int do_video_stillpicture(struct file *file, unsigned int fd, + unsigned int cmd, struct compat_video_still_picture __user *up) { struct video_still_picture __user *up_native; compat_uptr_t fp; @@ -190,7 +202,7 @@ 
static int do_video_stillpicture(unsigned int fd, unsigned int cmd, if (err) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, fd, cmd, (unsigned long) up_native); return err; } @@ -200,8 +212,8 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, - struct compat_video_spu_palette __user *up) +static int do_video_set_spu_palette(struct file *file, unsigned int fd, + unsigned int cmd, struct compat_video_spu_palette __user *up) { struct video_spu_palette __user *up_native; compat_uptr_t palp; @@ -218,7 +230,7 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, if (err) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, fd, cmd, (unsigned long) up_native); return err; } @@ -276,7 +288,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, +static int sg_ioctl_trans(struct file *file, unsigned int fd, unsigned int cmd, sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; @@ -289,7 +301,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, if (get_user(interface_id, &sgio32->interface_id)) return -EFAULT; if (interface_id != 'S') - return sys_ioctl(fd, cmd, (unsigned long)sgio32); + return do_ioctl(file, fd, cmd, (unsigned long)sgio32); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -349,7 +361,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) sgio); + err = do_ioctl(file, fd, cmd, (unsigned long) sgio); if (err >= 0) { void __user *datap; @@ -380,13 +392,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct - 
compat_sg_req_info __user *o) +static int sg_grt_trans(struct file *file, unsigned int fd, + unsigned int cmd, struct compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); - err = sys_ioctl(fd,cmd,(unsigned long)r); + err = do_ioctl(file, fd, cmd, (unsigned long)r); if (err < 0) return err; for (i = 0; i < SG_MAX_QUEUE; i++) { @@ -412,8 +424,8 @@ struct sock_fprog32 { #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, - struct sock_fprog32 __user *u_fprog32) +static int ppp_sock_fprog_ioctl_trans(struct file *file, unsigned int fd, + unsigned int cmd, struct sock_fprog32 __user *u_fprog32) { struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); void __user *fptr64; @@ -435,7 +447,7 @@ static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, else cmd = PPPIOCSACTIVE; - return sys_ioctl(fd, cmd, (unsigned long) u_fprog64); + return do_ioctl(file, fd, cmd, (unsigned long) u_fprog64); } struct ppp_option_data32 { @@ -451,7 +463,7 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(unsigned int fd, unsigned int cmd, +static int ppp_gidle(struct file *file, unsigned int fd, unsigned int cmd, struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; @@ -460,7 +472,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, idle = compat_alloc_user_space(sizeof(*idle)); - err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); + err = do_ioctl(file, fd, PPPIOCGIDLE, (unsigned long) idle); if (!err) { if (get_user(xmit, &idle->xmit_idle) || @@ -472,7 +484,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, return err; } -static int ppp_scompress(unsigned int fd, unsigned int cmd, +static int ppp_scompress(struct file *file, unsigned int fd, 
unsigned int cmd, struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; @@ -492,7 +504,7 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, sizeof(__u32) + sizeof(int))) return -EFAULT; - return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); + return do_ioctl(file, fd, PPPIOCSCOMPRESS, (unsigned long) odata); } #ifdef CONFIG_BLOCK @@ -512,7 +524,8 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) +static int mt_ioctl_trans(struct file *file, unsigned int fd, + unsigned int cmd, void __user *argp) { mm_segment_t old_fs = get_fs(); struct mtget get; @@ -534,7 +547,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) break; } set_fs (KERNEL_DS); - err = sys_ioctl (fd, kcmd, (unsigned long)karg); + err = do_ioctl(file, fd, kcmd, (unsigned long)karg); set_fs (old_fs); if (err) return err; @@ -605,8 +618,8 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(unsigned fd, unsigned cmd, - struct serial_struct32 __user *ss32) +static int serial_struct_ioctl(struct file *file, unsigned fd, + unsigned cmd, struct serial_struct32 __user *ss32) { typedef struct serial_struct32 SS32; int err; @@ -629,7 +642,7 @@ static int serial_struct_ioctl(unsigned fd, unsigned cmd, ss.iomap_base = 0UL; } set_fs(KERNEL_DS); - err = sys_ioctl(fd,cmd,(unsigned long)(&ss)); + err = do_ioctl(file, fd, cmd, (unsigned long)&ss); set_fs(oldseg); if (cmd == TIOCGSERIAL && err >= 0) { if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) @@ -674,8 +687,8 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, - struct i2c_rdwr_ioctl_data32 __user *udata) +static int do_i2c_rdwr_ioctl(struct file *file, unsigned int fd, + unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) { struct i2c_rdwr_aligned __user *tdata; 
struct i2c_msg __user *tmsgs; @@ -708,11 +721,11 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, put_user(compat_ptr(datap), &tmsgs[i].buf)) return -EFAULT; } - return sys_ioctl(fd, cmd, (unsigned long)tdata); + return do_ioctl(file, fd, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, - struct i2c_smbus_ioctl_data32 __user *udata) +static int do_i2c_smbus_ioctl(struct file *file, unsigned int fd, + unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; compat_caddr_t datap; @@ -734,7 +747,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, __put_user(compat_ptr(datap), &tdata->data)) return -EFAULT; - return sys_ioctl(fd, cmd, (unsigned long)tdata); + return do_ioctl(file, fd, cmd, (unsigned long)tdata); } #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) @@ -742,7 +755,8 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) +static int rtc_ioctl(struct file *file, unsigned fd, + unsigned cmd, void __user *argp) { mm_segment_t oldfs = get_fs(); compat_ulong_t val32; @@ -753,7 +767,7 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) case RTC_IRQP_READ32: case RTC_EPOCH_READ32: set_fs(KERNEL_DS); - ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? + ret = do_ioctl(file, fd, (cmd == RTC_IRQP_READ32) ? 
RTC_IRQP_READ : RTC_EPOCH_READ, (unsigned long)&kval); set_fs(oldfs); @@ -762,9 +776,9 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) val32 = kval; return put_user(val32, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); + return do_ioctl(file, fd, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); + return do_ioctl(file, fd, RTC_EPOCH_SET, (unsigned long)argp); } return -ENOIOCTLCMD; @@ -1443,46 +1457,46 @@ static long do_ioctl_trans(int fd, unsigned int cmd, switch (cmd) { case PPPIOCGIDLE32: - return ppp_gidle(fd, cmd, argp); + return ppp_gidle(file, fd, cmd, argp); case PPPIOCSCOMPRESS32: - return ppp_scompress(fd, cmd, argp); + return ppp_scompress(file, fd, cmd, argp); case PPPIOCSPASS32: case PPPIOCSACTIVE32: - return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); + return ppp_sock_fprog_ioctl_trans(file, fd, cmd, argp); #ifdef CONFIG_BLOCK case SG_IO: - return sg_ioctl_trans(fd, cmd, argp); + return sg_ioctl_trans(file, fd, cmd, argp); case SG_GET_REQUEST_TABLE: - return sg_grt_trans(fd, cmd, argp); + return sg_grt_trans(file, fd, cmd, argp); case MTIOCGET32: case MTIOCPOS32: - return mt_ioctl_trans(fd, cmd, argp); + return mt_ioctl_trans(file, fd, cmd, argp); #endif /* Serial */ case TIOCGSERIAL: case TIOCSSERIAL: - return serial_struct_ioctl(fd, cmd, argp); + return serial_struct_ioctl(file, fd, cmd, argp); /* i2c */ case I2C_FUNCS: - return w_long(fd, cmd, argp); + return w_long(file, fd, cmd, argp); case I2C_RDWR: - return do_i2c_rdwr_ioctl(fd, cmd, argp); + return do_i2c_rdwr_ioctl(file, fd, cmd, argp); case I2C_SMBUS: - return do_i2c_smbus_ioctl(fd, cmd, argp); + return do_i2c_smbus_ioctl(file, fd, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: - return rtc_ioctl(fd, cmd, argp); + return rtc_ioctl(file, fd, cmd, 
argp); /* dvb */ case VIDEO_GET_EVENT: - return do_video_get_event(fd, cmd, argp); + return do_video_get_event(file, fd, cmd, argp); case VIDEO_STILLPICTURE: - return do_video_stillpicture(fd, cmd, argp); + return do_video_stillpicture(file, fd, cmd, argp); case VIDEO_SET_SPU_PALETTE: - return do_video_set_spu_palette(fd, cmd, argp); + return do_video_set_spu_palette(file, fd, cmd, argp); } /* From 66cf191f3eae4582a83cb4251b75b43bee95a999 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Jan 2016 09:53:30 -0500 Subject: [PATCH 66/67] compat_ioctl: don't pass fd around when not needed Signed-off-by: Al Viro --- fs/compat_ioctl.c | 103 +++++++++++++++++++++++---------------------- fs/internal.h | 7 +++ fs/ioctl.c | 4 +- include/linux/fs.h | 2 - 4 files changed, 61 insertions(+), 55 deletions(-) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 06e60cab0c3b..908837cd2ac7 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -58,6 +58,8 @@ #include #include +#include "internal.h" + #include #include #include @@ -115,8 +117,7 @@ #include #endif -static int do_ioctl(struct file *file, unsigned int fd, - unsigned int cmd, unsigned long arg) +static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int err; @@ -124,10 +125,10 @@ static int do_ioctl(struct file *file, unsigned int fd, if (err) return err; - return do_vfs_ioctl(file, fd, cmd, arg); + return vfs_ioctl(file, cmd, arg); } -static int w_long(struct file *file, unsigned int fd, +static int w_long(struct file *file, unsigned int cmd, compat_ulong_t __user *argp) { mm_segment_t old_fs = get_fs(); @@ -135,7 +136,7 @@ static int w_long(struct file *file, unsigned int fd, unsigned long val; set_fs (KERNEL_DS); - err = do_ioctl(file, fd, cmd, (unsigned long)&val); + err = do_ioctl(file, cmd, (unsigned long)&val); set_fs (old_fs); if (!err && put_user(val, argp)) return -EFAULT; @@ -151,7 +152,7 @@ struct compat_video_event { } u; }; -static int do_video_get_event(struct file *file, 
unsigned int fd, +static int do_video_get_event(struct file *file, unsigned int cmd, struct compat_video_event __user *up) { struct video_event kevent; @@ -159,7 +160,7 @@ static int do_video_get_event(struct file *file, unsigned int fd, int err; set_fs(KERNEL_DS); - err = do_ioctl(file, fd, cmd, (unsigned long) &kevent); + err = do_ioctl(file, cmd, (unsigned long) &kevent); set_fs(old_fs); if (!err) { @@ -181,7 +182,7 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(struct file *file, unsigned int fd, +static int do_video_stillpicture(struct file *file, unsigned int cmd, struct compat_video_still_picture __user *up) { struct video_still_picture __user *up_native; @@ -202,7 +203,7 @@ static int do_video_stillpicture(struct file *file, unsigned int fd, if (err) return -EFAULT; - err = do_ioctl(file, fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, cmd, (unsigned long) up_native); return err; } @@ -212,7 +213,7 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(struct file *file, unsigned int fd, +static int do_video_set_spu_palette(struct file *file, unsigned int cmd, struct compat_video_spu_palette __user *up) { struct video_spu_palette __user *up_native; @@ -230,7 +231,7 @@ static int do_video_set_spu_palette(struct file *file, unsigned int fd, if (err) return -EFAULT; - err = do_ioctl(file, fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, cmd, (unsigned long) up_native); return err; } @@ -288,7 +289,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(struct file *file, unsigned int fd, unsigned int cmd, +static int sg_ioctl_trans(struct file *file, unsigned int cmd, sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; @@ -301,7 +302,7 @@ static int sg_ioctl_trans(struct file *file, unsigned int fd, unsigned int cmd, if (get_user(interface_id, &sgio32->interface_id)) return 
-EFAULT; if (interface_id != 'S') - return do_ioctl(file, fd, cmd, (unsigned long)sgio32); + return do_ioctl(file, cmd, (unsigned long)sgio32); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -361,7 +362,7 @@ static int sg_ioctl_trans(struct file *file, unsigned int fd, unsigned int cmd, if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; - err = do_ioctl(file, fd, cmd, (unsigned long) sgio); + err = do_ioctl(file, cmd, (unsigned long) sgio); if (err >= 0) { void __user *datap; @@ -392,13 +393,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(struct file *file, unsigned int fd, +static int sg_grt_trans(struct file *file, unsigned int cmd, struct compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); - err = do_ioctl(file, fd, cmd, (unsigned long)r); + err = do_ioctl(file, cmd, (unsigned long)r); if (err < 0) return err; for (i = 0; i < SG_MAX_QUEUE; i++) { @@ -424,7 +425,7 @@ struct sock_fprog32 { #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(struct file *file, unsigned int fd, +static int ppp_sock_fprog_ioctl_trans(struct file *file, unsigned int cmd, struct sock_fprog32 __user *u_fprog32) { struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); @@ -447,7 +448,7 @@ static int ppp_sock_fprog_ioctl_trans(struct file *file, unsigned int fd, else cmd = PPPIOCSACTIVE; - return do_ioctl(file, fd, cmd, (unsigned long) u_fprog64); + return do_ioctl(file, cmd, (unsigned long) u_fprog64); } struct ppp_option_data32 { @@ -463,7 +464,7 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(struct file *file, unsigned int fd, unsigned int cmd, +static int ppp_gidle(struct file *file, unsigned int cmd, struct 
ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; @@ -472,7 +473,7 @@ static int ppp_gidle(struct file *file, unsigned int fd, unsigned int cmd, idle = compat_alloc_user_space(sizeof(*idle)); - err = do_ioctl(file, fd, PPPIOCGIDLE, (unsigned long) idle); + err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle); if (!err) { if (get_user(xmit, &idle->xmit_idle) || @@ -484,7 +485,7 @@ static int ppp_gidle(struct file *file, unsigned int fd, unsigned int cmd, return err; } -static int ppp_scompress(struct file *file, unsigned int fd, unsigned int cmd, +static int ppp_scompress(struct file *file, unsigned int cmd, struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; @@ -504,7 +505,7 @@ static int ppp_scompress(struct file *file, unsigned int fd, unsigned int cmd, sizeof(__u32) + sizeof(int))) return -EFAULT; - return do_ioctl(file, fd, PPPIOCSCOMPRESS, (unsigned long) odata); + return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata); } #ifdef CONFIG_BLOCK @@ -524,7 +525,7 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(struct file *file, unsigned int fd, +static int mt_ioctl_trans(struct file *file, unsigned int cmd, void __user *argp) { mm_segment_t old_fs = get_fs(); @@ -547,7 +548,7 @@ static int mt_ioctl_trans(struct file *file, unsigned int fd, break; } set_fs (KERNEL_DS); - err = do_ioctl(file, fd, kcmd, (unsigned long)karg); + err = do_ioctl(file, kcmd, (unsigned long)karg); set_fs (old_fs); if (err) return err; @@ -618,7 +619,7 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(struct file *file, unsigned fd, +static int serial_struct_ioctl(struct file *file, unsigned cmd, struct serial_struct32 __user *ss32) { typedef struct serial_struct32 SS32; @@ -642,7 +643,7 @@ static int serial_struct_ioctl(struct file *file, unsigned fd, ss.iomap_base = 0UL; } set_fs(KERNEL_DS); - err = do_ioctl(file, fd, cmd, (unsigned long)&ss); + err = 
do_ioctl(file, cmd, (unsigned long)&ss); set_fs(oldseg); if (cmd == TIOCGSERIAL && err >= 0) { if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) @@ -687,7 +688,7 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(struct file *file, unsigned int fd, +static int do_i2c_rdwr_ioctl(struct file *file, unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) { struct i2c_rdwr_aligned __user *tdata; @@ -721,10 +722,10 @@ static int do_i2c_rdwr_ioctl(struct file *file, unsigned int fd, put_user(compat_ptr(datap), &tmsgs[i].buf)) return -EFAULT; } - return do_ioctl(file, fd, cmd, (unsigned long)tdata); + return do_ioctl(file, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(struct file *file, unsigned int fd, +static int do_i2c_smbus_ioctl(struct file *file, unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; @@ -747,7 +748,7 @@ static int do_i2c_smbus_ioctl(struct file *file, unsigned int fd, __put_user(compat_ptr(datap), &tdata->data)) return -EFAULT; - return do_ioctl(file, fd, cmd, (unsigned long)tdata); + return do_ioctl(file, cmd, (unsigned long)tdata); } #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) @@ -755,7 +756,7 @@ static int do_i2c_smbus_ioctl(struct file *file, unsigned int fd, #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(struct file *file, unsigned fd, +static int rtc_ioctl(struct file *file, unsigned cmd, void __user *argp) { mm_segment_t oldfs = get_fs(); @@ -767,7 +768,7 @@ static int rtc_ioctl(struct file *file, unsigned fd, case RTC_IRQP_READ32: case RTC_EPOCH_READ32: set_fs(KERNEL_DS); - ret = do_ioctl(file, fd, (cmd == RTC_IRQP_READ32) ? + ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ? 
RTC_IRQP_READ : RTC_EPOCH_READ, (unsigned long)&kval); set_fs(oldfs); @@ -776,9 +777,9 @@ static int rtc_ioctl(struct file *file, unsigned fd, val32 = kval; return put_user(val32, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return do_ioctl(file, fd, RTC_IRQP_SET, (unsigned long)argp); + return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return do_ioctl(file, fd, RTC_EPOCH_SET, (unsigned long)argp); + return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp); } return -ENOIOCTLCMD; @@ -1450,53 +1451,53 @@ IGNORE_IOCTL(FBIOGCURSOR32) * a compat_ioctl operation in the place that handles the * ioctl for the native case. */ -static long do_ioctl_trans(int fd, unsigned int cmd, +static long do_ioctl_trans(unsigned int cmd, unsigned long arg, struct file *file) { void __user *argp = compat_ptr(arg); switch (cmd) { case PPPIOCGIDLE32: - return ppp_gidle(file, fd, cmd, argp); + return ppp_gidle(file, cmd, argp); case PPPIOCSCOMPRESS32: - return ppp_scompress(file, fd, cmd, argp); + return ppp_scompress(file, cmd, argp); case PPPIOCSPASS32: case PPPIOCSACTIVE32: - return ppp_sock_fprog_ioctl_trans(file, fd, cmd, argp); + return ppp_sock_fprog_ioctl_trans(file, cmd, argp); #ifdef CONFIG_BLOCK case SG_IO: - return sg_ioctl_trans(file, fd, cmd, argp); + return sg_ioctl_trans(file, cmd, argp); case SG_GET_REQUEST_TABLE: - return sg_grt_trans(file, fd, cmd, argp); + return sg_grt_trans(file, cmd, argp); case MTIOCGET32: case MTIOCPOS32: - return mt_ioctl_trans(file, fd, cmd, argp); + return mt_ioctl_trans(file, cmd, argp); #endif /* Serial */ case TIOCGSERIAL: case TIOCSSERIAL: - return serial_struct_ioctl(file, fd, cmd, argp); + return serial_struct_ioctl(file, cmd, argp); /* i2c */ case I2C_FUNCS: - return w_long(file, fd, cmd, argp); + return w_long(file, cmd, argp); case I2C_RDWR: - return do_i2c_rdwr_ioctl(file, fd, cmd, argp); + return do_i2c_rdwr_ioctl(file, cmd, argp); case I2C_SMBUS: - return do_i2c_smbus_ioctl(file, fd, cmd, 
argp); + return do_i2c_smbus_ioctl(file, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: - return rtc_ioctl(file, fd, cmd, argp); + return rtc_ioctl(file, cmd, argp); /* dvb */ case VIDEO_GET_EVENT: - return do_video_get_event(file, fd, cmd, argp); + return do_video_get_event(file, cmd, argp); case VIDEO_STILLPICTURE: - return do_video_stillpicture(file, fd, cmd, argp); + return do_video_stillpicture(file, cmd, argp); case VIDEO_SET_SPU_PALETTE: - return do_video_set_spu_palette(file, fd, cmd, argp); + return do_video_set_spu_palette(file, cmd, argp); } /* @@ -1527,7 +1528,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case NBD_SET_BLKSIZE: case NBD_SET_SIZE: case NBD_SET_SIZE_BLOCKS: - return do_vfs_ioctl(file, fd, cmd, arg); + return vfs_ioctl(file, cmd, arg); } return -ENOIOCTLCMD; @@ -1616,7 +1617,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, if (compat_ioctl_check_table(XFORM(cmd))) goto found_handler; - error = do_ioctl_trans(fd, cmd, arg, f.file); + error = do_ioctl_trans(cmd, arg, f.file); if (error == -ENOIOCTLCMD) error = -ENOTTY; diff --git a/fs/internal.h b/fs/internal.h index 71859c4d0b41..e38c08ca437d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -151,3 +151,10 @@ extern void mnt_pin_kill(struct mount *m); * fs/nsfs.c */ extern struct dentry_operations ns_dentry_operations; + +/* + * fs/ioctl.c + */ +extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, + unsigned long arg); +extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/ioctl.c b/fs/ioctl.c index 5d01d2638ca5..41c352e81193 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -15,6 +15,7 @@ #include #include #include +#include "internal.h" #include @@ -32,8 +33,7 @@ * * Returns 0 on success, -errno on error. 
*/ -static long vfs_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa514254161..51f9f8d93d4d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2764,8 +2764,6 @@ extern int vfs_lstat(const char __user *, struct kstat *); extern int vfs_fstat(unsigned int, struct kstat *); extern int vfs_fstatat(int , const char __user *, struct kstat *, int); -extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, - unsigned long arg); extern int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, From a7f61e89af73e9bf760826b20dba4e637221fcb9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 5 Jan 2016 18:27:30 +0100 Subject: [PATCH 67/67] compat_ioctl: don't call do_ioctl under set_fs(KERNEL_DS) This replaces all code in fs/compat_ioctl.c that translated ioctl arguments into an in-kernel structure, then performed do_ioctl under set_fs(KERNEL_DS), with code that allocates data on the user stack and can call the VFS ioctl handler under USER_DS. This is done as a hardening measure because the caller does not know what kind of ioctl handler will be invoked, only that no corresponding compat_ioctl handler exists and what the ioctl command number is. The accidental invocation of an unlocked_ioctl handler that unexpectedly calls copy_to_user could be a severe security issue. 
Signed-off-by: Jann Horn Signed-off-by: Al Viro --- fs/compat_ioctl.c | 130 ++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 62 deletions(-) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 908837cd2ac7..9144b779d10e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -117,6 +117,13 @@ #include #endif +#define convert_in_user(srcptr, dstptr) \ +({ \ + typeof(*srcptr) val; \ + \ + get_user(val, srcptr) || put_user(val, dstptr); \ +}) + static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int err; @@ -131,16 +138,17 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) static int w_long(struct file *file, unsigned int cmd, compat_ulong_t __user *argp) { - mm_segment_t old_fs = get_fs(); int err; - unsigned long val; + unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); - set_fs (KERNEL_DS); - err = do_ioctl(file, cmd, (unsigned long)&val); - set_fs (old_fs); - if (!err && put_user(val, argp)) + if (valp == NULL) return -EFAULT; - return err; + err = do_ioctl(file, cmd, (unsigned long)valp); + if (err) + return err; + if (convert_in_user(valp, argp)) + return -EFAULT; + return 0; } struct compat_video_event { @@ -155,20 +163,20 @@ struct compat_video_event { static int do_video_get_event(struct file *file, unsigned int cmd, struct compat_video_event __user *up) { - struct video_event kevent; - mm_segment_t old_fs = get_fs(); + struct video_event __user *kevent = + compat_alloc_user_space(sizeof(*kevent)); int err; - set_fs(KERNEL_DS); - err = do_ioctl(file, cmd, (unsigned long) &kevent); - set_fs(old_fs); + if (kevent == NULL) + return -EFAULT; + err = do_ioctl(file, cmd, (unsigned long)kevent); if (!err) { - err = put_user(kevent.type, &up->type); - err |= put_user(kevent.timestamp, &up->timestamp); - err |= put_user(kevent.u.size.w, &up->u.size.w); - err |= put_user(kevent.u.size.h, &up->u.size.h); - err |= put_user(kevent.u.size.aspect_ratio, + err = 
convert_in_user(&kevent->type, &up->type); + err |= convert_in_user(&kevent->timestamp, &up->timestamp); + err |= convert_in_user(&kevent->u.size.w, &up->u.size.w); + err |= convert_in_user(&kevent->u.size.h, &up->u.size.h); + err |= convert_in_user(&kevent->u.size.aspect_ratio, &up->u.size.aspect_ratio); if (err) err = -EFAULT; @@ -528,10 +536,10 @@ struct mtpos32 { static int mt_ioctl_trans(struct file *file, unsigned int cmd, void __user *argp) { - mm_segment_t old_fs = get_fs(); - struct mtget get; + /* NULL initialization to make gcc shut up */ + struct mtget __user *get = NULL; struct mtget32 __user *umget32; - struct mtpos pos; + struct mtpos __user *pos = NULL; struct mtpos32 __user *upos32; unsigned long kcmd; void *karg; @@ -540,32 +548,34 @@ static int mt_ioctl_trans(struct file *file, switch(cmd) { case MTIOCPOS32: kcmd = MTIOCPOS; - karg = &pos; + pos = compat_alloc_user_space(sizeof(*pos)); + karg = pos; break; default: /* MTIOCGET32 */ kcmd = MTIOCGET; - karg = &get; + get = compat_alloc_user_space(sizeof(*get)); + karg = get; break; } - set_fs (KERNEL_DS); + if (karg == NULL) + return -EFAULT; err = do_ioctl(file, kcmd, (unsigned long)karg); - set_fs (old_fs); if (err) return err; switch (cmd) { case MTIOCPOS32: upos32 = argp; - err = __put_user(pos.mt_blkno, &upos32->mt_blkno); + err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno); break; case MTIOCGET32: umget32 = argp; - err = __put_user(get.mt_type, &umget32->mt_type); - err |= __put_user(get.mt_resid, &umget32->mt_resid); - err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); - err |= __put_user(get.mt_gstat, &umget32->mt_gstat); - err |= __put_user(get.mt_erreg, &umget32->mt_erreg); - err |= __put_user(get.mt_fileno, &umget32->mt_fileno); - err |= __put_user(get.mt_blkno, &umget32->mt_blkno); + err = convert_in_user(&get->mt_type, &umget32->mt_type); + err |= convert_in_user(&get->mt_resid, &umget32->mt_resid); + err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg); + err |= 
convert_in_user(&get->mt_gstat, &umget32->mt_gstat); + err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg); + err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno); + err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno); break; } return err ? -EFAULT: 0; @@ -624,37 +634,36 @@ static int serial_struct_ioctl(struct file *file, { typedef struct serial_struct32 SS32; int err; - struct serial_struct ss; - mm_segment_t oldseg = get_fs(); + struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss)); __u32 udata; unsigned int base; + unsigned char *iomem_base; + if (ss == NULL) + return -EFAULT; if (cmd == TIOCSSERIAL) { - if (!access_ok(VERIFY_READ, ss32, sizeof(SS32))) - return -EFAULT; - if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base))) + if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) || + get_user(udata, &ss32->iomem_base)) return -EFAULT; - if (__get_user(udata, &ss32->iomem_base)) + iomem_base = compat_ptr(udata); + if (put_user(iomem_base, &ss->iomem_base) || + convert_in_user(&ss32->iomem_reg_shift, + &ss->iomem_reg_shift) || + convert_in_user(&ss32->port_high, &ss->port_high) || + put_user(0UL, &ss->iomap_base)) return -EFAULT; - ss.iomem_base = compat_ptr(udata); - if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || - __get_user(ss.port_high, &ss32->port_high)) - return -EFAULT; - ss.iomap_base = 0UL; } - set_fs(KERNEL_DS); - err = do_ioctl(file, cmd, (unsigned long)&ss); - set_fs(oldseg); + err = do_ioctl(file, cmd, (unsigned long)ss); if (cmd == TIOCGSERIAL && err >= 0) { - if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) - return -EFAULT; - if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base))) + if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) || + get_user(iomem_base, &ss->iomem_base)) return -EFAULT; - base = (unsigned long)ss.iomem_base >> 32 ? 
- 0xffffffff : (unsigned)(unsigned long)ss.iomem_base; - if (__put_user(base, &ss32->iomem_base) || - __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || - __put_user(ss.port_high, &ss32->port_high)) + base = (unsigned long)iomem_base >> 32 ? + 0xffffffff : (unsigned)(unsigned long)iomem_base; + if (put_user(base, &ss32->iomem_base) || + convert_in_user(&ss->iomem_reg_shift, + &ss32->iomem_reg_shift) || + convert_in_user(&ss->port_high, &ss32->port_high)) return -EFAULT; } return err; @@ -759,23 +768,20 @@ static int do_i2c_smbus_ioctl(struct file *file, static int rtc_ioctl(struct file *file, unsigned cmd, void __user *argp) { - mm_segment_t oldfs = get_fs(); - compat_ulong_t val32; - unsigned long kval; + unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); int ret; + if (valp == NULL) + return -EFAULT; switch (cmd) { case RTC_IRQP_READ32: case RTC_EPOCH_READ32: - set_fs(KERNEL_DS); ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ? RTC_IRQP_READ : RTC_EPOCH_READ, - (unsigned long)&kval); - set_fs(oldfs); + (unsigned long)valp); if (ret) return ret; - val32 = kval; - return put_user(val32, (unsigned int __user *)argp); + return convert_in_user(valp, (unsigned int __user *)argp); case RTC_IRQP_SET32: return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: