From 4ab2bb3c311a45d80d31f2f189606871669ed792 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Filipe=20La=C3=ADns?= <lains@archlinux.org>
Date: Sat, 11 Jan 2020 19:24:19 +0000
Subject: [PATCH 001/239] HID: logitech-hidpp: BatteryVoltage: only read
 chargeStatus if extPower is active
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the HID++ 2.0 function getBatteryInfo() from the BatteryVoltage
(0x1001) feature, chargeStatus is only valid if extPower is active.

Previously we were ignoring extPower, which resulted in wrong values.

Example:
    With an unplugged mouse

    $ cat /sys/class/power_supply/hidpp_battery_0/status
    Charging

This patch fixes that, it also renames charge_sts to flags as
charge_sts can be confused with chargeStatus from the spec.

Spec:
+--------+-------------------------------------------------------------------------+
|  byte  |                                    2                                    |
+--------+--------------+------------+------------+----------+----------+----------+
|   bit  |     0..2     |      3     |      4     |     5    |     6    |     7    |
+--------+--------------+------------+------------+----------+----------+----------+
| buffer | chargeStatus | fastCharge | slowCharge | critical | (unused) | extPower |
+--------+--------------+------------+------------+----------+----------+----------+
Table 1 - battery voltage (0x1001), getBatteryInfo() (ASE 0), 3rd byte

+-------+--------------------------------------+
| value |                meaning               |
+-------+--------------------------------------+
|   0   | Charging                             |
+-------+--------------------------------------+
|   1   | End of charge (100% charged)         |
+-------+--------------------------------------+
|   2   | Charge stopped (any "normal" reason) |
+-------+--------------------------------------+
|   7   | Hardware error                       |
+-------+--------------------------------------+
Table 2 - chargeStatus value

Signed-off-by: Filipe Laíns <lains@archlinux.org>
Tested-by: Pedro Vanzella <pedro@pedrovanzella.com>
Reviewed-by: Pedro Vanzella <pedro@pedrovanzella.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/hid-logitech-hidpp.c | 43 ++++++++++++++++----------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c
index 70e1cb928bf0..094f4f1b6555 100644
--- a/drivers/hid/hid-logitech-hidpp.c
+++ b/drivers/hid/hid-logitech-hidpp.c
@@ -1256,36 +1256,35 @@ static int hidpp20_battery_map_status_voltage(u8 data[3], int *voltage,
 {
 	int status;
 
-	long charge_sts = (long)data[2];
+	long flags = (long) data[2];
 
-	*level = POWER_SUPPLY_CAPACITY_LEVEL_UNKNOWN;
-	switch (data[2] & 0xe0) {
-	case 0x00:
-		status = POWER_SUPPLY_STATUS_CHARGING;
-		break;
-	case 0x20:
-		status = POWER_SUPPLY_STATUS_FULL;
-		*level = POWER_SUPPLY_CAPACITY_LEVEL_FULL;
-		break;
-	case 0x40:
+	if (flags & 0x80)
+		switch (flags & 0x07) {
+		case 0:
+			status = POWER_SUPPLY_STATUS_CHARGING;
+			break;
+		case 1:
+			status = POWER_SUPPLY_STATUS_FULL;
+			*level = POWER_SUPPLY_CAPACITY_LEVEL_FULL;
+			break;
+		case 2:
+			status = POWER_SUPPLY_STATUS_NOT_CHARGING;
+			break;
+		default:
+			status = POWER_SUPPLY_STATUS_UNKNOWN;
+			break;
+		}
+	else
 		status = POWER_SUPPLY_STATUS_DISCHARGING;
-		break;
-	case 0xe0:
-		status = POWER_SUPPLY_STATUS_NOT_CHARGING;
-		break;
-	default:
-		status = POWER_SUPPLY_STATUS_UNKNOWN;
-	}
 
 	*charge_type = POWER_SUPPLY_CHARGE_TYPE_STANDARD;
-	if (test_bit(3, &charge_sts)) {
+	if (test_bit(3, &flags)) {
 		*charge_type = POWER_SUPPLY_CHARGE_TYPE_FAST;
 	}
-	if (test_bit(4, &charge_sts)) {
+	if (test_bit(4, &flags)) {
 		*charge_type = POWER_SUPPLY_CHARGE_TYPE_TRICKLE;
 	}
-
-	if (test_bit(5, &charge_sts)) {
+	if (test_bit(5, &flags)) {
 		*level = POWER_SUPPLY_CAPACITY_LEVEL_CRITICAL;
 	}
 

From beae56192a2570578ae45050e73c5ff9254f63e6 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sat, 1 Feb 2020 12:56:48 +0100
Subject: [PATCH 002/239] HID: ite: Only bind to keyboard USB interface on Acer
 SW5-012 keyboard dock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 8f18eca9ebc5 ("HID: ite: Add USB id match for Acer SW5-012 keyboard
dock") added the USB id for the Acer SW5-012's keyboard dock to the
hid-ite driver to fix the rfkill driver not working.

Most keyboard docks with an ITE 8595 keyboard/touchpad controller have the
"Wireless Radio Control" bits which need the special hid-ite driver on the
second USB interface (the mouse interface) and their touchpad only supports
mouse emulation, so using generic hid-input handling for anything but
the "Wireless Radio Control" bits is fine. On these devices we simply bind
to all USB interfaces.

But unlike other ITE8595 using keyboard docks, the Acer Aspire Switch 10
(SW5-012)'s touchpad not only does mouse emulation it also supports
HID-multitouch and all the keys including the "Wireless Radio Control"
bits have been moved to the first USB interface (the keyboard intf).

So we need hid-ite to handle the first (keyboard) USB interface and have
it NOT bind to the second (mouse) USB interface so that that can be
handled by hid-multitouch.c and we get proper multi-touch support.

This commit changes the hid_device_id for the SW5-012 keyboard dock to
only match on hid devices from the HID_GROUP_GENERIC group, this way
hid-ite will not bind the the mouse/multi-touch interface which has
HID_GROUP_MULTITOUCH_WIN_8 as group.
This fixes the regression to mouse-emulation mode introduced by adding
the keyboard dock USB id.

Cc: stable@vger.kernel.org
Fixes: 8f18eca9ebc5 ("HID: ite: Add USB id match for Acer SW5-012 keyboard dock")
Reported-by: Zdeněk Rampas <zdenda.rampas@gmail.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/hid-ite.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/hid/hid-ite.c b/drivers/hid/hid-ite.c
index c436e12feb23..6c55682c5974 100644
--- a/drivers/hid/hid-ite.c
+++ b/drivers/hid/hid-ite.c
@@ -41,8 +41,9 @@ static const struct hid_device_id ite_devices[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_ITE, USB_DEVICE_ID_ITE8595) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_258A, USB_DEVICE_ID_258A_6A88) },
 	/* ITE8595 USB kbd ctlr, with Synaptics touchpad connected to it. */
-	{ HID_USB_DEVICE(USB_VENDOR_ID_SYNAPTICS,
-			 USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_012) },
+	{ HID_DEVICE(BUS_USB, HID_GROUP_GENERIC,
+		     USB_VENDOR_ID_SYNAPTICS,
+		     USB_DEVICE_ID_SYNAPTICS_ACER_SWITCH5_012) },
 	{ }
 };
 MODULE_DEVICE_TABLE(hid, ite_devices);

From e20d8e81a0e06c672e964c9f01100f07a64b1ce6 Mon Sep 17 00:00:00 2001
From: Brendan Higgins <brendanhiggins@google.com>
Date: Fri, 31 Jan 2020 16:01:02 -0800
Subject: [PATCH 003/239] Documentation: kunit: fixed sphinx error in code
 block

Fix a missing newline in a code block that was causing a warning:

Documentation/dev-tools/kunit/usage.rst:553: WARNING: Error in "code-block" directive:
maximum 1 argument(s) allowed, 3 supplied.

.. code-block:: bash
        modprobe example-test

Signed-off-by: Brendan Higgins <brendanhiggins@google.com>
Reviewed-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 Documentation/dev-tools/kunit/usage.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/dev-tools/kunit/usage.rst b/Documentation/dev-tools/kunit/usage.rst
index 7cd56a1993b1..607758a66a99 100644
--- a/Documentation/dev-tools/kunit/usage.rst
+++ b/Documentation/dev-tools/kunit/usage.rst
@@ -551,6 +551,7 @@ options to your ``.config``:
 Once the kernel is built and installed, a simple
 
 .. code-block:: bash
+
 	modprobe example-test
 
 ...will run the tests.

From 0cbb4f9c69827decf56519c2f63918f16904ede5 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Mon, 3 Feb 2020 09:46:19 -0800
Subject: [PATCH 004/239] platform/chrome: wilco_ec: Include asm/unaligned
 instead of linux/ path

It seems that we shouldn't try to include the include/linux/ path to
unaligned functions. Just include asm/unaligned.h instead so that we
don't run into compilation warnings like below.

   In file included from drivers/platform/chrome/wilco_ec/properties.c:8:0:
   include/linux/unaligned/le_memmove.h:7:19: error: redefinition of 'get_unaligned_le16'
    static inline u16 get_unaligned_le16(const void *p)
                      ^~~~~~~~~~~~~~~~~~
   In file included from arch/ia64/include/asm/unaligned.h:5:0,
                    from arch/ia64/include/asm/io.h:23,
                    from arch/ia64/include/asm/smp.h:21,
                    from include/linux/smp.h:68,
                    from include/linux/percpu.h:7,
                    from include/linux/arch_topology.h:9,
                    from include/linux/topology.h:30,
                    from include/linux/gfp.h:9,
                    from include/linux/xarray.h:14,
                    from include/linux/radix-tree.h:18,
                    from include/linux/idr.h:15,
                    from include/linux/kernfs.h:13,
                    from include/linux/sysfs.h:16,
                    from include/linux/kobject.h:20,
                    from include/linux/device.h:16,
                    from include/linux/platform_data/wilco-ec.h:11,
                    from drivers/platform/chrome/wilco_ec/properties.c:6:
   include/linux/unaligned/le_struct.h:7:19: note: previous definition of 'get_unaligned_le16' was here
    static inline u16 get_unaligned_le16(const void *p)
                      ^~~~~~~~~~~~~~~~~~
Reported-by: kbuild test robot <lkp@intel.com>
Fixes: 60fb8a8e93ca ("platform/chrome: wilco_ec: Allow wilco to be compiled in COMPILE_TEST")
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
---
 drivers/platform/chrome/wilco_ec/properties.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/platform/chrome/wilco_ec/properties.c b/drivers/platform/chrome/wilco_ec/properties.c
index e69682c95ea2..62f27610dd33 100644
--- a/drivers/platform/chrome/wilco_ec/properties.c
+++ b/drivers/platform/chrome/wilco_ec/properties.c
@@ -5,7 +5,7 @@
 
 #include <linux/platform_data/wilco-ec.h>
 #include <linux/string.h>
-#include <linux/unaligned/le_memmove.h>
+#include <asm/unaligned.h>
 
 /* Operation code; what the EC should do with the property */
 enum ec_property_op {

From e433be929e63265b7412478eb7ff271467aee2d7 Mon Sep 17 00:00:00 2001
From: Mansour Behabadi <mansour@oxplot.com>
Date: Wed, 29 Jan 2020 17:26:31 +1100
Subject: [PATCH 005/239] HID: apple: Add support for recent firmware on Magic
 Keyboards

Magic Keyboards with more recent firmware (0x0100) report Fn key differently.
Without this patch, Fn key may not behave as expected and may not be
configurable via hid_apple fnmode module parameter.

Signed-off-by: Mansour Behabadi <mansour@oxplot.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-apple.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c
index 6ac8becc2372..d732d1d10caf 100644
--- a/drivers/hid/hid-apple.c
+++ b/drivers/hid/hid-apple.c
@@ -340,7 +340,8 @@ static int apple_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 		unsigned long **bit, int *max)
 {
 	if (usage->hid == (HID_UP_CUSTOM | 0x0003) ||
-			usage->hid == (HID_UP_MSVENDOR | 0x0003)) {
+			usage->hid == (HID_UP_MSVENDOR | 0x0003) ||
+			usage->hid == (HID_UP_HPVENDOR2 | 0x0003)) {
 		/* The fn key on Apple USB keyboards */
 		set_bit(EV_REP, hi->input->evbit);
 		hid_map_usage_clear(hi, usage, bit, max, EV_KEY, KEY_FN);

From 5ebdffd25098898aff1249ae2f7dbfddd76d8f8f Mon Sep 17 00:00:00 2001
From: Johan Korsnes <jkorsnes@cisco.com>
Date: Fri, 17 Jan 2020 13:08:35 +0100
Subject: [PATCH 006/239] HID: core: fix off-by-one memset in
 hid_report_raw_event()

In case a report is greater than HID_MAX_BUFFER_SIZE, it is truncated,
but the report-number byte is not correctly handled. This results in a
off-by-one in the following memset, causing a kernel Oops and ensuing
system crash.

Note: With commit 8ec321e96e05 ("HID: Fix slab-out-of-bounds read in
hid_field_extract") I no longer hit the kernel Oops as we instead fail
"controlled" at probe if there is a report too long in the HID
report-descriptor. hid_report_raw_event() is an exported symbol, so
presumabely we cannot always rely on this being the case.

Fixes: 966922f26c7f ("HID: fix a crash in hid_report_raw_event()
                     function.")
Signed-off-by: Johan Korsnes <jkorsnes@cisco.com>
Cc: Armando Visconti <armando.visconti@st.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 851fe54ea59e..359616e3efbb 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1741,7 +1741,9 @@ int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, u32 size,
 
 	rsize = ((report->size - 1) >> 3) + 1;
 
-	if (rsize > HID_MAX_BUFFER_SIZE)
+	if (report_enum->numbered && rsize >= HID_MAX_BUFFER_SIZE)
+		rsize = HID_MAX_BUFFER_SIZE - 1;
+	else if (rsize > HID_MAX_BUFFER_SIZE)
 		rsize = HID_MAX_BUFFER_SIZE;
 
 	if (csize < rsize) {

From 84a4062632462c4320704fcdf8e99e89e94c0aba Mon Sep 17 00:00:00 2001
From: Johan Korsnes <jkorsnes@cisco.com>
Date: Fri, 17 Jan 2020 13:08:36 +0100
Subject: [PATCH 007/239] HID: core: increase HID report buffer size to 8KiB

We have a HID touch device that reports its opens and shorts test
results in HID buffers of size 8184 bytes. The maximum size of the HID
buffer is currently set to 4096 bytes, causing probe of this device to
fail. With this patch we increase the maximum size of the HID buffer to
8192 bytes, making device probe and acquisition of said buffers succeed.

Signed-off-by: Johan Korsnes <jkorsnes@cisco.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Armando Visconti <armando.visconti@st.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/hid.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/hid.h b/include/linux/hid.h
index cd41f209043f..875f71132b14 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -492,7 +492,7 @@ struct hid_report_enum {
 };
 
 #define HID_MIN_BUFFER_SIZE	64		/* make sure there is at least a packet size of space */
-#define HID_MAX_BUFFER_SIZE	4096		/* 4kb */
+#define HID_MAX_BUFFER_SIZE	8192		/* 8kb */
 #define HID_CONTROL_FIFO_SIZE	256		/* to init devices with >100 reports */
 #define HID_OUTPUT_FIFO_SIZE	64
 

From 5c02c447eaeda29d3da121a2e17b97ccaf579b51 Mon Sep 17 00:00:00 2001
From: "dan.carpenter@oracle.com" <dan.carpenter@oracle.com>
Date: Wed, 15 Jan 2020 20:46:28 +0300
Subject: [PATCH 008/239] HID: hiddev: Fix race in in hiddev_disconnect()

Syzbot reports that "hiddev" is used after it's free in hiddev_disconnect().
The hiddev_disconnect() function sets "hiddev->exist = 0;" so
hiddev_release() can free it as soon as we drop the "existancelock"
lock.  This patch moves the mutex_unlock(&hiddev->existancelock) until
after we have finished using it.

Reported-by: syzbot+784ccb935f9900cc7c9e@syzkaller.appspotmail.com
Fixes: 7f77897ef2b6 ("HID: hiddev: fix potential use-after-free")
Suggested-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/usbhid/hiddev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c
index a970b809d778..4140dea693e9 100644
--- a/drivers/hid/usbhid/hiddev.c
+++ b/drivers/hid/usbhid/hiddev.c
@@ -932,9 +932,9 @@ void hiddev_disconnect(struct hid_device *hid)
 	hiddev->exist = 0;
 
 	if (hiddev->open) {
-		mutex_unlock(&hiddev->existancelock);
 		hid_hw_close(hiddev->hid);
 		wake_up_interruptible(&hiddev->wait);
+		mutex_unlock(&hiddev->existancelock);
 	} else {
 		mutex_unlock(&hiddev->existancelock);
 		kfree(hiddev);

From 8d2e77b39b8fecb794e19cd006a12f90b14dd077 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Wed, 4 Dec 2019 04:35:25 +0100
Subject: [PATCH 009/239] HID: alps: Fix an error handling path in
 'alps_input_configured()'

They are issues:
   - if 'input_allocate_device()' fails and return NULL, there is no need
     to free anything and 'input_free_device()' call is a no-op. It can
     be axed.
   - 'ret' is known to be 0 at this point, so we must set it to a
     meaningful value before returning

Fixes: 2562756dde55 ("HID: add Alps I2C HID Touchpad-Stick support")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/hid-alps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hid/hid-alps.c b/drivers/hid/hid-alps.c
index ae79a7c66737..fa704153cb00 100644
--- a/drivers/hid/hid-alps.c
+++ b/drivers/hid/hid-alps.c
@@ -730,7 +730,7 @@ static int alps_input_configured(struct hid_device *hdev, struct hid_input *hi)
 	if (data->has_sp) {
 		input2 = input_allocate_device();
 		if (!input2) {
-			input_free_device(input2);
+			ret = -ENOMEM;
 			goto exit;
 		}
 

From ead68df94d248c80fdbae220ae5425eb5af2e753 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 12 Feb 2020 20:15:29 +0100
Subject: [PATCH 010/239] KVM: x86: enable -Werror

Avoid more embarrassing mistakes.  At least those that the compiler
can catch.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b19ef421084d..4654e97a05cc 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ccflags-y += -Iarch/x86/kvm
+ccflags-y += -Werror
 
 KVM := ../../../virt/kvm
 

From c14335ebb92a98646ddbf447e6cacc66de5269ad Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Sun, 9 Feb 2020 21:12:02 -0800
Subject: [PATCH 011/239] scsi: Revert "target/core: Inline
 transport_lun_remove_cmd()"

Commit 83f85b8ec305 postponed the percpu_ref_put(&se_cmd->se_lun->lun_ref)
call from command completion to the time when the final command reference
is dropped. That approach is not compatible with the iSCSI target driver
because the iSCSI target driver keeps the command with the highest stat_sn
after it has completed until the next command is received (see also
iscsit_ack_from_expstatsn()). Fix this regression by reverting commit
83f85b8ec305.

Fixes: 83f85b8ec305 ("scsi: target/core: Inline transport_lun_remove_cmd()")
Cc: Pavel Zakharov <pavel.zakharov@delphix.com>
Cc: Mike Christie <mchristi@redhat.com>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/20200210051202.12934-1-bvanassche@acm.org
Reported-by: Pavel Zakharov <pavel.zakharov@delphix.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/target_core_transport.c | 31 +++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index ea482d4b1f00..0ae9e60fc4d5 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -666,6 +666,11 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
 
 	target_remove_from_state_list(cmd);
 
+	/*
+	 * Clear struct se_cmd->se_lun before the handoff to FE.
+	 */
+	cmd->se_lun = NULL;
+
 	spin_lock_irqsave(&cmd->t_state_lock, flags);
 	/*
 	 * Determine if frontend context caller is requesting the stopping of
@@ -693,6 +698,17 @@ static int transport_cmd_check_stop_to_fabric(struct se_cmd *cmd)
 	return cmd->se_tfo->check_stop_free(cmd);
 }
 
+static void transport_lun_remove_cmd(struct se_cmd *cmd)
+{
+	struct se_lun *lun = cmd->se_lun;
+
+	if (!lun)
+		return;
+
+	if (cmpxchg(&cmd->lun_ref_active, true, false))
+		percpu_ref_put(&lun->lun_ref);
+}
+
 static void target_complete_failure_work(struct work_struct *work)
 {
 	struct se_cmd *cmd = container_of(work, struct se_cmd, work);
@@ -783,6 +799,8 @@ static void target_handle_abort(struct se_cmd *cmd)
 
 	WARN_ON_ONCE(kref_read(&cmd->cmd_kref) == 0);
 
+	transport_lun_remove_cmd(cmd);
+
 	transport_cmd_check_stop_to_fabric(cmd);
 }
 
@@ -1708,6 +1726,7 @@ static void target_complete_tmr_failure(struct work_struct *work)
 	se_cmd->se_tmr_req->response = TMR_LUN_DOES_NOT_EXIST;
 	se_cmd->se_tfo->queue_tm_rsp(se_cmd);
 
+	transport_lun_remove_cmd(se_cmd);
 	transport_cmd_check_stop_to_fabric(se_cmd);
 }
 
@@ -1898,6 +1917,7 @@ void transport_generic_request_failure(struct se_cmd *cmd,
 		goto queue_full;
 
 check_stop:
+	transport_lun_remove_cmd(cmd);
 	transport_cmd_check_stop_to_fabric(cmd);
 	return;
 
@@ -2195,6 +2215,7 @@ static void transport_complete_qf(struct se_cmd *cmd)
 		transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 		return;
 	}
+	transport_lun_remove_cmd(cmd);
 	transport_cmd_check_stop_to_fabric(cmd);
 }
 
@@ -2289,6 +2310,7 @@ static void target_complete_ok_work(struct work_struct *work)
 		if (ret)
 			goto queue_full;
 
+		transport_lun_remove_cmd(cmd);
 		transport_cmd_check_stop_to_fabric(cmd);
 		return;
 	}
@@ -2314,6 +2336,7 @@ static void target_complete_ok_work(struct work_struct *work)
 			if (ret)
 				goto queue_full;
 
+			transport_lun_remove_cmd(cmd);
 			transport_cmd_check_stop_to_fabric(cmd);
 			return;
 		}
@@ -2349,6 +2372,7 @@ static void target_complete_ok_work(struct work_struct *work)
 			if (ret)
 				goto queue_full;
 
+			transport_lun_remove_cmd(cmd);
 			transport_cmd_check_stop_to_fabric(cmd);
 			return;
 		}
@@ -2384,6 +2408,7 @@ static void target_complete_ok_work(struct work_struct *work)
 		break;
 	}
 
+	transport_lun_remove_cmd(cmd);
 	transport_cmd_check_stop_to_fabric(cmd);
 	return;
 
@@ -2710,6 +2735,9 @@ int transport_generic_free_cmd(struct se_cmd *cmd, int wait_for_tasks)
 		 */
 		if (cmd->state_active)
 			target_remove_from_state_list(cmd);
+
+		if (cmd->se_lun)
+			transport_lun_remove_cmd(cmd);
 	}
 	if (aborted)
 		cmd->free_compl = &compl;
@@ -2781,9 +2809,6 @@ static void target_release_cmd_kref(struct kref *kref)
 	struct completion *abrt_compl = se_cmd->abrt_compl;
 	unsigned long flags;
 
-	if (se_cmd->lun_ref_active)
-		percpu_ref_put(&se_cmd->se_lun->lun_ref);
-
 	if (se_sess) {
 		spin_lock_irqsave(&se_sess->sess_cmd_lock, flags);
 		list_del_init(&se_cmd->se_cmd_list);

From 0e99b2c625da181aebf1a3d13493e3f7a5057a9c Mon Sep 17 00:00:00 2001
From: Tomas Henzl <thenzl@redhat.com>
Date: Tue, 4 Feb 2020 16:24:13 +0100
Subject: [PATCH 012/239] scsi: megaraid_sas: silence a warning

Add a flag to DMA memory allocation to silence a warning.

This driver allocates DMA memory for IO frames. This allocation may exceed
MAX_ORDER pages for few megaraid_sas controllers (controllers with very
high queue depth). Consequently, the driver has logic to keep reducing the
controller queue depth until the DMA memory allocation succeeds.

On impacted megaraid_sas controllers there would be multiple DMA allocation
failures until driver settled on an allocation that fit. These failed DMA
allocation requests caused stack traces in system logs. These were not
harmful and this patch silences those warnings/stack traces.

[mkp: clarified commit desc]

Link: https://lore.kernel.org/r/20200204152413.7107-1-thenzl@redhat.com
Signed-off-by: Tomas Henzl <thenzl@redhat.com>
Acked-by: Sumit Saxena <sumit.saxena@broadcom.com>
Reviewed-by: Lee Duncan <lduncan@suse.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/megaraid/megaraid_sas_fusion.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index f3b36fd0a0eb..b2ad96564484 100644
--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
+++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
@@ -623,7 +623,8 @@ megasas_alloc_request_fusion(struct megasas_instance *instance)
 
 	fusion->io_request_frames =
 			dma_pool_alloc(fusion->io_request_frames_pool,
-				GFP_KERNEL, &fusion->io_request_frames_phys);
+				GFP_KERNEL | __GFP_NOWARN,
+				&fusion->io_request_frames_phys);
 	if (!fusion->io_request_frames) {
 		if (instance->max_fw_cmds >= (MEGASAS_REDUCE_QD_COUNT * 2)) {
 			instance->max_fw_cmds -= MEGASAS_REDUCE_QD_COUNT;
@@ -661,7 +662,7 @@ megasas_alloc_request_fusion(struct megasas_instance *instance)
 
 		fusion->io_request_frames =
 			dma_pool_alloc(fusion->io_request_frames_pool,
-				       GFP_KERNEL,
+				       GFP_KERNEL | __GFP_NOWARN,
 				       &fusion->io_request_frames_phys);
 
 		if (!fusion->io_request_frames) {

From 872d92dec353a8d30fa186892cd5ea3e17ca75d3 Mon Sep 17 00:00:00 2001
From: Hongbo Yao <yaohongbo@huawei.com>
Date: Wed, 22 Jan 2020 17:12:38 +0800
Subject: [PATCH 013/239] tee: amdtee: amdtee depends on CRYPTO_DEV_CCP_DD

If CRYPTO_DEV_CCP_DD=m and AMDTEE=y, the following error is seen
while building call.c or core.c

drivers/tee/amdtee/call.o: In function `handle_unload_ta':
call.c:(.text+0x35f): undefined reference to `psp_tee_process_cmd'
drivers/tee/amdtee/core.o: In function `amdtee_driver_init':
core.c:(.init.text+0xf): undefined reference to `psp_check_tee_status

Fix the config dependency for AMDTEE here.

Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: 757cc3e9ff1d ("tee: add AMD-TEE driver")
Signed-off-by: Hongbo Yao <yaohongbo@huawei.com>
Reviewed-by: Rijo Thomas <Rijo-john.Thomas@amd.com>
Acked-by: Jens Wiklander <jens.wiklander@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/tee/amdtee/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/tee/amdtee/Kconfig b/drivers/tee/amdtee/Kconfig
index 4e32b6413b41..191f9715fa9a 100644
--- a/drivers/tee/amdtee/Kconfig
+++ b/drivers/tee/amdtee/Kconfig
@@ -3,6 +3,6 @@
 config AMDTEE
 	tristate "AMD-TEE"
 	default m
-	depends on CRYPTO_DEV_SP_PSP
+	depends on CRYPTO_DEV_SP_PSP && CRYPTO_DEV_CCP_DD
 	help
 	  This implements AMD's Trusted Execution Environment (TEE) driver.

From 6e5cf31fbe651bed7ba1df768f2e123531132417 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 4 Feb 2020 13:28:41 +0100
Subject: [PATCH 014/239] x86/mce/amd: Publish the bank pointer only after
 setup has succeeded

threshold_create_bank() creates a bank descriptor per MCA error
thresholding counter which can be controlled over sysfs. It publishes
the pointer to that bank in a per-CPU variable and then goes on to
create additional thresholding blocks if the bank has such.

However, that creation of additional blocks in
allocate_threshold_blocks() can fail, leading to a use-after-free
through the per-CPU pointer.

Therefore, publish that pointer only after all blocks have been setup
successfully.

Fixes: 019f34fccfd5 ("x86, MCE, AMD: Move shared bank to node descriptor")
Reported-by: Saar Amar <Saar.Amar@microsoft.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/20200128140846.phctkvx5btiexvbx@kili.mountain
---
 arch/x86/kernel/cpu/mce/amd.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index b3a50d962851..e7313e5c497c 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1198,8 +1198,9 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
 	return buf_mcatype;
 }
 
-static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
-				     unsigned int block, u32 address)
+static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
+				     unsigned int bank, unsigned int block,
+				     u32 address)
 {
 	struct threshold_block *b = NULL;
 	u32 low, high;
@@ -1243,16 +1244,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
 
 	INIT_LIST_HEAD(&b->miscj);
 
-	if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
-		list_add(&b->miscj,
-			 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
-	} else {
-		per_cpu(threshold_banks, cpu)[bank]->blocks = b;
-	}
+	if (tb->blocks)
+		list_add(&b->miscj, &tb->blocks->miscj);
+	else
+		tb->blocks = b;
 
-	err = kobject_init_and_add(&b->kobj, &threshold_ktype,
-				   per_cpu(threshold_banks, cpu)[bank]->kobj,
-				   get_name(bank, b));
+	err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
 	if (err)
 		goto out_free;
 recurse:
@@ -1260,7 +1257,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
 	if (!address)
 		return 0;
 
-	err = allocate_threshold_blocks(cpu, bank, block, address);
+	err = allocate_threshold_blocks(cpu, tb, bank, block, address);
 	if (err)
 		goto out_free;
 
@@ -1345,8 +1342,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out_free;
 	}
 
-	per_cpu(threshold_banks, cpu)[bank] = b;
-
 	if (is_shared_bank(bank)) {
 		refcount_set(&b->cpus, 1);
 
@@ -1357,9 +1352,13 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		}
 	}
 
-	err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
-	if (!err)
-		goto out;
+	err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
+	if (err)
+		goto out_free;
+
+	per_cpu(threshold_banks, cpu)[bank] = b;
+
+	return 0;
 
  out_free:
 	kfree(b);

From c9cc0517bba9f0213f1e55172feceb99e5512daf Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 6 Feb 2020 12:42:01 +0100
Subject: [PATCH 015/239] crypto: chacha20poly1305 - prevent integer overflow
 on large input

This code assigns src_len (size_t) to sl (int), which causes problems
when src_len is very large. Probably nobody in the kernel should be
passing this much data to chacha20poly1305 all in one go anyway, so I
don't think we need to change the algorithm or introduce larger types
or anything. But we should at least error out early in this case and
print a warning so that we get reports if this does happen and can look
into why anybody is possibly passing it that much data or if they're
accidently passing -1 or similar.

Fixes: d95312a3ccc0 ("crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine")
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: stable@vger.kernel.org # 5.5+
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 lib/crypto/chacha20poly1305.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c
index 6d83cafebc69..ad0699ce702f 100644
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -235,6 +235,9 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
 		__le64 lens[2];
 	} b __aligned(16);
 
+	if (WARN_ON(src_len > INT_MAX))
+		return false;
+
 	chacha_load_key(b.k, key);
 
 	b.iv[0] = 0;

From 51dede9c05df2b78acd6dcf6a17d21f0877d2d7b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 13 Feb 2020 19:01:34 +0100
Subject: [PATCH 016/239] x86/mce/amd: Fix kobject lifetime

Accessing the MCA thresholding controls in sysfs concurrently with CPU
hotplug can lead to a couple of KASAN-reported issues:

  BUG: KASAN: use-after-free in sysfs_file_ops+0x155/0x180
  Read of size 8 at addr ffff888367578940 by task grep/4019

and

  BUG: KASAN: use-after-free in show_error_count+0x15c/0x180
  Read of size 2 at addr ffff888368a05514 by task grep/4454

for example. Both result from the fact that the threshold block
creation/teardown code frees the descriptor memory itself instead of
defining proper ->release function and leaving it to the driver core to
take care of that, after all sysfs accesses have completed.

Do that and get rid of the custom freeing code, fixing the above UAFs in
the process.

  [ bp: write commit message. ]

Fixes: 95268664390b ("[PATCH] x86_64: mce_amd support for family 0x10 processors")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/20200214082801.13836-1-bp@alien8.de
---
 arch/x86/kernel/cpu/mce/amd.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e7313e5c497c..52de616a8065 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1163,9 +1163,12 @@ static const struct sysfs_ops threshold_ops = {
 	.store			= store,
 };
 
+static void threshold_block_release(struct kobject *kobj);
+
 static struct kobj_type threshold_ktype = {
 	.sysfs_ops		= &threshold_ops,
 	.default_attrs		= default_attrs,
+	.release		= threshold_block_release,
 };
 
 static const char *get_name(unsigned int bank, struct threshold_block *b)
@@ -1367,8 +1370,12 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	return err;
 }
 
-static void deallocate_threshold_block(unsigned int cpu,
-						 unsigned int bank)
+static void threshold_block_release(struct kobject *kobj)
+{
+	kfree(to_block(kobj));
+}
+
+static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
 {
 	struct threshold_block *pos = NULL;
 	struct threshold_block *tmp = NULL;
@@ -1378,13 +1385,11 @@ static void deallocate_threshold_block(unsigned int cpu,
 		return;
 
 	list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
-		kobject_put(&pos->kobj);
 		list_del(&pos->miscj);
-		kfree(pos);
+		kobject_put(&pos->kobj);
 	}
 
-	kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
-	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+	kobject_put(&head->blocks->kobj);
 }
 
 static void __threshold_remove_blocks(struct threshold_bank *b)

From cba6437a1854fde5934098ec3bd0ee83af3129f5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Feb 2020 12:19:41 +0100
Subject: [PATCH 017/239] genirq/proc: Reject invalid affinity masks (again)

Qian Cai reported that the WARN_ON() in the x86/msi affinity setting code,
which catches cases where the affinity setting is not done on the CPU which
is the current target of the interrupt, triggers during CPU hotplug stress
testing.

It turns out that the warning which was added with the commit addressing
the MSI affinity race unearthed yet another long standing bug.

If user space writes a bogus affinity mask, i.e. it contains no online CPUs,
then it calls irq_select_affinity_usr(). This was introduced for ALPHA in

  eee45269b0f5 ("[PATCH] Alpha: convert to generic irq framework (generic part)")

and subsequently made available for all architectures in

  18404756765c ("genirq: Expose default irq affinity mask (take 3)")

which introduced the circumvention of the affinity setting restrictions for
interrupt which cannot be moved in process context.

The whole exercise is bogus in various aspects:

  1) If the interrupt is already started up then there is absolutely
     no point to honour a bogus interrupt affinity setting from user
     space. The interrupt is already assigned to an online CPU and it
     does not make any sense to reassign it to some other randomly
     chosen online CPU.

  2) If the interupt is not yet started up then there is no point
     either. A subsequent startup of the interrupt will invoke
     irq_setup_affinity() anyway which will chose a valid target CPU.

So the only correct solution is to just return -EINVAL in case user space
wrote an affinity mask which does not contain any online CPUs, except for
ALPHA which has it's own magic sauce for this.

Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)")
Reported-by: Qian Cai <cai@lca.pw>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Qian Cai <cai@lca.pw>
Link: https://lkml.kernel.org/r/878sl8xdbm.fsf@nanos.tec.linutronix.de
---
 kernel/irq/internals.h |  2 --
 kernel/irq/manage.c    | 18 ++----------------
 kernel/irq/proc.c      | 22 ++++++++++++++++++++++
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 3924fbe829d4..c9d8eb7f5c02 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq,
 
 extern bool irq_can_set_affinity_usr(unsigned int irq);
 
-extern int irq_select_affinity_usr(unsigned int irq);
-
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
 extern int irq_do_set_affinity(struct irq_data *data,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3089a60ea8f9..7eee98c38f25 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -481,23 +481,9 @@ int irq_setup_affinity(struct irq_desc *desc)
 {
 	return irq_select_affinity(irq_desc_get_irq(desc));
 }
-#endif
+#endif /* CONFIG_AUTO_IRQ_AFFINITY */
+#endif /* CONFIG_SMP */
 
-/*
- * Called when a bogus affinity is set via /proc/irq
- */
-int irq_select_affinity_usr(unsigned int irq)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-	unsigned long flags;
-	int ret;
-
-	raw_spin_lock_irqsave(&desc->lock, flags);
-	ret = irq_setup_affinity(desc);
-	raw_spin_unlock_irqrestore(&desc->lock, flags);
-	return ret;
-}
-#endif
 
 /**
  *	irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9e5783d98033..32c071d7bc03 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -111,6 +111,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
 	return show_irq_affinity(AFFINITY_LIST, m);
 }
 
+#ifndef CONFIG_AUTO_IRQ_AFFINITY
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+	/*
+	 * If the interrupt is started up already then this fails. The
+	 * interrupt is assigned to an online CPU already. There is no
+	 * point to move it around randomly. Tell user space that the
+	 * selected mask is bogus.
+	 *
+	 * If not then any change to the affinity is pointless because the
+	 * startup code invokes irq_setup_affinity() which will select
+	 * a online CPU anyway.
+	 */
+	return -EINVAL;
+}
+#else
+/* ALPHA magic affinity auto selector. Keep it for historical reasons. */
+static inline int irq_select_affinity_usr(unsigned int irq)
+{
+	return irq_select_affinity(irq);
+}
+#endif
 
 static ssize_t write_irq_affinity(int type, struct file *file,
 		const char __user *buffer, size_t count, loff_t *pos)

From be0aba826c4a6ba5929def1962a90d6127871969 Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Fri, 14 Feb 2020 14:53:07 +0800
Subject: [PATCH 018/239] HID: i2c-hid: add Trekstor Surfbook E11B to
 descriptor override

The Surfbook E11B uses the SIPODEV SP1064 touchpad, which does not supply
descriptors, so it has to be added to the override list.

BugLink: https://bugs.launchpad.net/bugs/1858299
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c b/drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c
index d31ea82b84c1..a66f08041a1a 100644
--- a/drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c
+++ b/drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c
@@ -341,6 +341,14 @@ static const struct dmi_system_id i2c_hid_dmi_desc_override_table[] = {
 		},
 		.driver_data = (void *)&sipodev_desc
 	},
+	{
+		.ident = "Trekstor SURFBOOK E11B",
+		.matches = {
+			DMI_EXACT_MATCH(DMI_SYS_VENDOR, "TREKSTOR"),
+			DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "SURFBOOK E11B"),
+		},
+		.driver_data = (void *)&sipodev_desc
+	},
 	{
 		.ident = "Direkt-Tek DTLAPY116-2",
 		.matches = {

From 76261ada16dcc3be610396a46d35acc3efbda682 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 12 Feb 2020 21:08:59 -0800
Subject: [PATCH 019/239] scsi: Revert "RDMA/isert: Fix a recently introduced
 regression related to logout"

Since commit 04060db41178 introduces soft lockups when toggling network
interfaces, revert it.

Link: https://marc.info/?l=target-devel&m=158157054906196
Cc: Rahul Kundu <rahul.kundu@chelsio.com>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Reported-by: Dakshaja Uppalapati <dakshaja@chelsio.com>
Fixes: 04060db41178 ("scsi: RDMA/isert: Fix a recently introduced regression related to logout")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/infiniband/ulp/isert/ib_isert.c | 12 ++++++++++++
 drivers/target/iscsi/iscsi_target.c     |  6 +++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index b273e421e910..a1a035270cab 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -2575,6 +2575,17 @@ isert_wait4logout(struct isert_conn *isert_conn)
 	}
 }
 
+static void
+isert_wait4cmds(struct iscsi_conn *conn)
+{
+	isert_info("iscsi_conn %p\n", conn);
+
+	if (conn->sess) {
+		target_sess_cmd_list_set_waiting(conn->sess->se_sess);
+		target_wait_for_sess_cmds(conn->sess->se_sess);
+	}
+}
+
 /**
  * isert_put_unsol_pending_cmds() - Drop commands waiting for
  *     unsolicitate dataout
@@ -2622,6 +2633,7 @@ static void isert_wait_conn(struct iscsi_conn *conn)
 
 	ib_drain_qp(isert_conn->qp);
 	isert_put_unsol_pending_cmds(conn);
+	isert_wait4cmds(conn);
 	isert_wait4logout(isert_conn);
 
 	queue_work(isert_release_wq, &isert_conn->release_work);
diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index b94ed4e30770..7251a87bb576 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -4149,6 +4149,9 @@ int iscsit_close_connection(
 	iscsit_stop_nopin_response_timer(conn);
 	iscsit_stop_nopin_timer(conn);
 
+	if (conn->conn_transport->iscsit_wait_conn)
+		conn->conn_transport->iscsit_wait_conn(conn);
+
 	/*
 	 * During Connection recovery drop unacknowledged out of order
 	 * commands for this connection, and prepare the other commands
@@ -4234,9 +4237,6 @@ int iscsit_close_connection(
 	target_sess_cmd_list_set_waiting(sess->se_sess);
 	target_wait_for_sess_cmds(sess->se_sess);
 
-	if (conn->conn_transport->iscsit_wait_conn)
-		conn->conn_transport->iscsit_wait_conn(conn);
-
 	ahash_request_free(conn->conn_tx_hash);
 	if (conn->conn_rx_hash) {
 		struct crypto_ahash *tfm;

From 807b9515b7d044cf77df31f1af9d842a76ecd5cb Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 12 Feb 2020 21:09:00 -0800
Subject: [PATCH 020/239] scsi: Revert "target: iscsi: Wait for all commands to
 finish before freeing a session"

Since commit e9d3009cb936 introduced a regression and since the fix for
that regression was not perfect, revert this commit.

Link: https://marc.info/?l=target-devel&m=158157054906195
Cc: Rahul Kundu <rahul.kundu@chelsio.com>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Reported-by: Dakshaja Uppalapati <dakshaja@chelsio.com>
Fixes: e9d3009cb936 ("scsi: target: iscsi: Wait for all commands to finish before freeing a session")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/target/iscsi/iscsi_target.c | 10 ++--------
 include/scsi/iscsi_proto.h          |  1 -
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index 7251a87bb576..09e55ea0bf5d 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -1165,9 +1165,7 @@ int iscsit_setup_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
 		hdr->cmdsn, be32_to_cpu(hdr->data_length), payload_length,
 		conn->cid);
 
-	if (target_get_sess_cmd(&cmd->se_cmd, true) < 0)
-		return iscsit_add_reject_cmd(cmd,
-				ISCSI_REASON_WAITING_FOR_LOGOUT, buf);
+	target_get_sess_cmd(&cmd->se_cmd, true);
 
 	cmd->sense_reason = transport_lookup_cmd_lun(&cmd->se_cmd,
 						     scsilun_to_int(&hdr->lun));
@@ -2004,9 +2002,7 @@ iscsit_handle_task_mgt_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
 			      conn->sess->se_sess, 0, DMA_NONE,
 			      TCM_SIMPLE_TAG, cmd->sense_buffer + 2);
 
-	if (target_get_sess_cmd(&cmd->se_cmd, true) < 0)
-		return iscsit_add_reject_cmd(cmd,
-				ISCSI_REASON_WAITING_FOR_LOGOUT, buf);
+	target_get_sess_cmd(&cmd->se_cmd, true);
 
 	/*
 	 * TASK_REASSIGN for ERL=2 / connection stays inside of
@@ -4234,8 +4230,6 @@ int iscsit_close_connection(
 	 * must wait until they have completed.
 	 */
 	iscsit_check_conn_usage_count(conn);
-	target_sess_cmd_list_set_waiting(sess->se_sess);
-	target_wait_for_sess_cmds(sess->se_sess);
 
 	ahash_request_free(conn->conn_tx_hash);
 	if (conn->conn_rx_hash) {
diff --git a/include/scsi/iscsi_proto.h b/include/scsi/iscsi_proto.h
index 533f56733ba8..b71b5c4f418c 100644
--- a/include/scsi/iscsi_proto.h
+++ b/include/scsi/iscsi_proto.h
@@ -627,7 +627,6 @@ struct iscsi_reject {
 #define ISCSI_REASON_BOOKMARK_INVALID	9
 #define ISCSI_REASON_BOOKMARK_NO_RESOURCES	10
 #define ISCSI_REASON_NEGOTIATION_RESET	11
-#define ISCSI_REASON_WAITING_FOR_LOGOUT	12
 
 /* Max. number of Key=Value pairs in a text message */
 #define MAX_KEY_VALUE_PAIRS	8192

From d3f703c4359ff06619b2322b91f69710453e6b6d Mon Sep 17 00:00:00 2001
From: Victor Kamensky <kamensky@cisco.com>
Date: Tue, 11 Feb 2020 11:24:33 -0800
Subject: [PATCH 021/239] mips: vdso: fix 'jalr t9' crash in vdso code

Observed that when kernel is built with Yocto mips64-poky-linux-gcc,
and mips64-poky-linux-gnun32-gcc toolchain, resulting vdso contains
'jalr t9' instructions in its code and since in vdso case nobody
sets GOT table code crashes when instruction reached. On other hand
observed that when kernel is built mips-poky-linux-gcc toolchain, the
same 'jalr t9' instruction are replaced with PC relative function
calls using 'bal' instructions.

The difference boils down to -mrelax-pic-calls and -mexplicit-relocs
gcc options that gets different default values depending on gcc
target triplets and corresponding binutils. -mrelax-pic-calls got
enabled by default only in mips-poky-linux-gcc case. MIPS binutils
ld relies on R_MIPS_JALR relocation to convert 'jalr t9' into 'bal'
and such relocation is generated only if -mrelax-pic-calls option
is on.

Please note 'jalr t9' conversion to 'bal' can happen only to static
functions. These static PIC calls use mips local GOT entries that
are supposed to be filled with start of DSO value by run-time linker
(missing in VDSO case) and they do not have dynamic relocations.
Global mips GOT entries must have dynamic relocations and they should
be prevented by cmd_vdso_check Makefile rule.

Solution call out -mrelax-pic-calls and -mexplicit-relocs options
explicitly while compiling MIPS vdso code. That would get correct
and consistent between different toolchains behaviour.

Reported-by: Bruce Ashfield <bruce.ashfield@gmail.com>
Signed-off-by: Victor Kamensky <kamensky@cisco.com>
Signed-off-by: Paul Burton <paulburton@kernel.org>
Cc: linux-mips@vger.kernel.org
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: richard.purdie@linuxfoundation.org
---
 arch/mips/vdso/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index aa89a41dc5dd..848baeaef1f8 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -33,6 +33,7 @@ endif
 cflags-vdso := $(ccflags-vdso) \
 	$(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \
 	-O3 -g -fPIC -fno-strict-aliasing -fno-common -fno-builtin -G 0 \
+	-mrelax-pic-calls -mexplicit-relocs \
 	-fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \
 	$(call cc-option, -fno-asynchronous-unwind-tables) \
 	$(call cc-option, -fno-stack-protector)

From 07015d7a103c4420b69a287b8ef4d2535c0f4106 Mon Sep 17 00:00:00 2001
From: Paul Burton <paulburton@kernel.org>
Date: Sat, 15 Feb 2020 12:38:36 -0800
Subject: [PATCH 022/239] MIPS: Disable VDSO time functionality on microMIPS

A check we're about to add to pick up on function calls that depend on
bogus use of the GOT in the VDSO picked up on instances of such function
calls in microMIPS builds. Since the code appears genuinely problematic,
and given the relatively small amount of use & testing that microMIPS
sees, go ahead & disable the VDSO for microMIPS builds.

Signed-off-by: Paul Burton <paulburton@kernel.org>
---
 arch/mips/vdso/Makefile | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index 848baeaef1f8..b5e0bd82d47f 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -52,6 +52,8 @@ endif
 
 CFLAGS_REMOVE_vgettimeofday.o = -pg
 
+DISABLE_VDSO := n
+
 #
 # For the pre-R6 code in arch/mips/vdso/vdso.h for locating
 # the base address of VDSO, the linker will emit a R_MIPS_PC32
@@ -65,11 +67,24 @@ CFLAGS_REMOVE_vgettimeofday.o = -pg
 ifndef CONFIG_CPU_MIPSR6
   ifeq ($(call ld-ifversion, -lt, 225000000, y),y)
     $(warning MIPS VDSO requires binutils >= 2.25)
-    obj-vdso-y := $(filter-out vgettimeofday.o, $(obj-vdso-y))
-    ccflags-vdso += -DDISABLE_MIPS_VDSO
+    DISABLE_VDSO := y
   endif
 endif
 
+#
+# GCC (at least up to version 9.2) appears to emit function calls that make use
+# of the GOT when targeting microMIPS, which we can't use in the VDSO due to
+# the lack of relocations. As such, we disable the VDSO for microMIPS builds.
+#
+ifdef CONFIG_CPU_MICROMIPS
+  DISABLE_VDSO := y
+endif
+
+ifeq ($(DISABLE_VDSO),y)
+  obj-vdso-y := $(filter-out vgettimeofday.o, $(obj-vdso-y))
+  ccflags-vdso += -DDISABLE_MIPS_VDSO
+endif
+
 # VDSO linker flags.
 VDSO_LDFLAGS := \
 	-Wl,-Bsymbolic -Wl,--no-undefined -Wl,-soname=linux-vdso.so.1 \

From 976c23af3ee5bd3447a7bfb6c356ceb4acf264a6 Mon Sep 17 00:00:00 2001
From: Victor Kamensky <kamensky@cisco.com>
Date: Tue, 11 Feb 2020 11:24:34 -0800
Subject: [PATCH 023/239] mips: vdso: add build time check that no 'jalr t9'
 calls left

vdso shared object cannot have GOT based PIC 'jalr t9' calls
because nobody set GOT table in vdso. Contributing into vdso
.o files are compiled in PIC mode and as result for internal
static functions calls compiler will generate 'jalr t9'
instructions. Those are supposed to be converted into PC
relative 'bal' calls by linker when relocation are processed.

Mips global GOT entries do have dynamic relocations and they
will be caught by cmd_vdso_check Makefile rule. Static PIC
calls go through mips local GOT entries that do not have
dynamic relocations. For those 'jalr t9' calls could be present
but without dynamic relocations and they need to be converted
to 'bal' calls by linker.

Add additional build time check to make sure that no 'jalr t9'
slip through because of some toolchain misconfiguration that
prevents 'jalr t9' to 'bal' conversion.

Signed-off-by: Victor Kamensky <kamensky@cisco.com>
Signed-off-by: Paul Burton <paulburton@kernel.org>
Cc: linux-mips@vger.kernel.org
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: bruce.ashfield@gmail.com
Cc: richard.purdie@linuxfoundation.org
---
 arch/mips/vdso/Makefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index b5e0bd82d47f..77374c1f0c77 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -97,12 +97,18 @@ GCOV_PROFILE := n
 UBSAN_SANITIZE := n
 KCOV_INSTRUMENT := n
 
+# Check that we don't have PIC 'jalr t9' calls left
+quiet_cmd_vdso_mips_check = VDSOCHK $@
+      cmd_vdso_mips_check = if $(OBJDUMP) --disassemble $@ | egrep -h "jalr.*t9" > /dev/null; \
+		       then (echo >&2 "$@: PIC 'jalr t9' calls are not supported"; \
+			     rm -f $@; /bin/false); fi
+
 #
 # Shared build commands.
 #
 
 quiet_cmd_vdsold_and_vdso_check = LD      $@
-      cmd_vdsold_and_vdso_check = $(cmd_vdsold); $(cmd_vdso_check)
+      cmd_vdsold_and_vdso_check = $(cmd_vdsold); $(cmd_vdso_check); $(cmd_vdso_mips_check)
 
 quiet_cmd_vdsold = VDSO    $@
       cmd_vdsold = $(CC) $(c_flags) $(VDSO_LDFLAGS) \

From 97e914b7de3c943011779b979b8093fdc0d85722 Mon Sep 17 00:00:00 2001
From: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Date: Wed, 12 Feb 2020 10:24:55 +1300
Subject: [PATCH 024/239] MIPS: cavium_octeon: Fix syncw generation.

The Cavium Octeon CPU uses a special sync instruction for implementing
wmb, and due to a CPU bug, the instruction must appear twice. A macro
had been defined to hide this:

 #define __SYNC_rpt(type)     (1 + (type == __SYNC_wmb))

which was intended to evaluate to 2 for __SYNC_wmb, and 1 for any other
type of sync. However, this expression is evaluated by the assembler,
and not the compiler, and the result of '==' in the assembler is 0 or
-1, not 0 or 1 as it is in C. The net result was wmb() producing no code
at all. The simple fix in this patch is to change the '+' to '-'.

Fixes: bf92927251b3 ("MIPS: barrier: Add __SYNC() infrastructure")
Signed-off-by: Mark Tomlinson <mark.tomlinson@alliedtelesis.co.nz>
Tested-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: Paul Burton <paulburton@kernel.org>
Cc: linux-mips@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/mips/include/asm/sync.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/mips/include/asm/sync.h b/arch/mips/include/asm/sync.h
index 7c6a1095f556..aabd097933fe 100644
--- a/arch/mips/include/asm/sync.h
+++ b/arch/mips/include/asm/sync.h
@@ -155,9 +155,11 @@
  * effective barrier as noted by commit 6b07d38aaa52 ("MIPS: Octeon: Use
  * optimized memory barrier primitives."). Here we specify that the affected
  * sync instructions should be emitted twice.
+ * Note that this expression is evaluated by the assembler (not the compiler),
+ * and that the assembler evaluates '==' as 0 or -1, not 0 or 1.
  */
 #ifdef CONFIG_CPU_CAVIUM_OCTEON
-# define __SYNC_rpt(type)	(1 + (type == __SYNC_wmb))
+# define __SYNC_rpt(type)	(1 - (type == __SYNC_wmb))
 #else
 # define __SYNC_rpt(type)	1
 #endif

From bef8e2dfceed6daeb6ca3e8d33f9c9d43b926580 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sun, 2 Feb 2020 21:19:22 +0100
Subject: [PATCH 025/239] MIPS: VPE: Fix a double free and a memory leak in
 'release_vpe()'

Pointer on the memory allocated by 'alloc_progmem()' is stored in
'v->load_addr'. So this is this memory that should be freed by
'release_progmem()'.

'release_progmem()' is only a call to 'kfree()'.

With the current code, there is both a double free and a memory leak.
Fix it by passing the correct pointer to 'release_progmem()'.

Fixes: e01402b115ccc ("More AP / SP bits for the 34K, the Malta bits and things. Still wants")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Paul Burton <paulburton@kernel.org>
Cc: ralf@linux-mips.org
Cc: linux-mips@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: kernel-janitors@vger.kernel.org
---
 arch/mips/kernel/vpe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 6176b9acba95..d0d832ab3d3b 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c
@@ -134,7 +134,7 @@ void release_vpe(struct vpe *v)
 {
 	list_del(&v->list);
 	if (v->load_addr)
-		release_progmem(v);
+		release_progmem(v->load_addr);
 	kfree(v);
 }
 

From 7fbeb95d0f68e21e6ca61284f1ac681630976947 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 16 Feb 2020 01:01:18 +0300
Subject: [PATCH 026/239] io_uring: add missing io_req_cancelled()

fallocate_finish() is missing cancellation check. Add it.
It's safe to do that, as only flags setup and sqe fields copy are done
before it gets into __io_fallocate().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5a826017ebb8..29565d82291f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2517,6 +2517,9 @@ static void io_fallocate_finish(struct io_wq_work **workptr)
 	struct io_kiocb *nxt = NULL;
 	int ret;
 
+	if (io_req_cancelled(req))
+		return;
+
 	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
 				req->sync.len);
 	if (ret < 0)
@@ -2904,6 +2907,7 @@ static void io_close_finish(struct io_wq_work **workptr)
 	struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
 	struct io_kiocb *nxt = NULL;
 
+	/* not cancellable, don't do io_req_cancelled() */
 	__io_close_finish(req, &nxt);
 	if (nxt)
 		io_wq_assign_next(workptr, nxt);

From d4f194ed9eb9841a8f978710e4d24296f791a85b Mon Sep 17 00:00:00 2001
From: Sam Bobroff <sbobroff@linux.ibm.com>
Date: Fri, 7 Feb 2020 15:57:31 +1100
Subject: [PATCH 027/239] powerpc/eeh: Fix deadlock handling dead PHB

Recovering a dead PHB can currently cause a deadlock as the PCI
rescan/remove lock is taken twice.

This is caused as part of an existing bug in
eeh_handle_special_event(). The pe is processed while traversing the
PHBs even though the pe is unrelated to the loop. This causes the pe
to be, incorrectly, processed more than once.

Untangling this section can move the pe processing out of the loop and
also outside the locked section, correcting both problems.

Fixes: 2e25505147b8 ("powerpc/eeh: Fix crash when edev->pdev changes")
Cc: stable@vger.kernel.org # 5.4+
Signed-off-by: Sam Bobroff <sbobroff@linux.ibm.com>
Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com>
Tested-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/0547e82dbf90ee0729a2979a8cac5c91665c621f.1581051445.git.sbobroff@linux.ibm.com
---
 arch/powerpc/kernel/eeh_driver.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index a1eaffe868de..7b048cee767c 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -1184,6 +1184,17 @@ void eeh_handle_special_event(void)
 			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
 			eeh_handle_normal_event(pe);
 		} else {
+			eeh_for_each_pe(pe, tmp_pe)
+				eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
+					edev->mode &= ~EEH_DEV_NO_HANDLER;
+
+			/* Notify all devices to be down */
+			eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
+			eeh_set_channel_state(pe, pci_channel_io_perm_failure);
+			eeh_pe_report(
+				"error_detected(permanent failure)", pe,
+				eeh_report_failure, NULL);
+
 			pci_lock_rescan_remove();
 			list_for_each_entry(hose, &hose_list, list_node) {
 				phb_pe = eeh_phb_pe_get(hose);
@@ -1192,16 +1203,6 @@ void eeh_handle_special_event(void)
 				    (phb_pe->state & EEH_PE_RECOVERING))
 					continue;
 
-				eeh_for_each_pe(pe, tmp_pe)
-					eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
-						edev->mode &= ~EEH_DEV_NO_HANDLER;
-
-				/* Notify all devices to be down */
-				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
-				eeh_set_channel_state(pe, pci_channel_io_perm_failure);
-				eeh_pe_report(
-					"error_detected(permanent failure)", pe,
-					eeh_report_failure, NULL);
 				bus = eeh_pe_bus_get(phb_pe);
 				if (!bus) {
 					pr_err("%s: Cannot find PCI bus for "

From f2b67ef90b0d5eca0f2255e02cf2f620bc0ddcdb Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Thu, 6 Feb 2020 13:50:28 +0000
Subject: [PATCH 028/239] powerpc/hugetlb: Fix 512k hugepages on 8xx with 16k
 page size

Commit 55c8fc3f4930 ("powerpc/8xx: reintroduce 16K pages with HW
assistance") redefined pte_t as a struct of 4 pte_basic_t, because
in 16K pages mode there are four identical entries in the
page table. But the size of hugepage tables is calculated based
of the size of (void *). Therefore, we end up with page tables
of size 1k instead of 4k for 512k pages.

As 512k hugepage tables are the same size as standard page tables,
ie 4k, use the standard page tables instead of PGT_CACHE tables.

Fixes: 3fb69c6a1a13 ("powerpc/8xx: Enable 512k hugepage support with HW assistance")
Cc: stable@vger.kernel.org # v5.0+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/90ec56a2315be602494619ed0223bba3b0b8d619.1580997007.git.christophe.leroy@c-s.fr
---
 arch/powerpc/mm/hugetlbpage.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 73d4873fc7f8..33b3461d91e8 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -53,20 +53,24 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 	if (pshift >= pdshift) {
 		cachep = PGT_CACHE(PTE_T_ORDER);
 		num_hugepd = 1 << (pshift - pdshift);
+		new = NULL;
 	} else if (IS_ENABLED(CONFIG_PPC_8xx)) {
-		cachep = PGT_CACHE(PTE_INDEX_SIZE);
+		cachep = NULL;
 		num_hugepd = 1;
+		new = pte_alloc_one(mm);
 	} else {
 		cachep = PGT_CACHE(pdshift - pshift);
 		num_hugepd = 1;
+		new = NULL;
 	}
 
-	if (!cachep) {
+	if (!cachep && !new) {
 		WARN_ONCE(1, "No page table cache created for hugetlb tables");
 		return -ENOMEM;
 	}
 
-	new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
+	if (cachep)
+		new = kmem_cache_alloc(cachep, pgtable_gfp_flags(mm, GFP_KERNEL));
 
 	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
 	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
@@ -97,7 +101,10 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 	if (i < num_hugepd) {
 		for (i = i - 1 ; i >= 0; i--, hpdp--)
 			*hpdp = __hugepd(0);
-		kmem_cache_free(cachep, new);
+		if (cachep)
+			kmem_cache_free(cachep, new);
+		else
+			pte_free(mm, new);
 	} else {
 		kmemleak_ignore(new);
 	}
@@ -324,8 +331,7 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
 	if (shift >= pdshift)
 		hugepd_free(tlb, hugepte);
 	else if (IS_ENABLED(CONFIG_PPC_8xx))
-		pgtable_free_tlb(tlb, hugepte,
-				 get_hugepd_cache_index(PTE_INDEX_SIZE));
+		pgtable_free_tlb(tlb, hugepte, 0);
 	else
 		pgtable_free_tlb(tlb, hugepte,
 				 get_hugepd_cache_index(pdshift - shift));
@@ -639,12 +645,13 @@ static int __init hugetlbpage_init(void)
 		 * if we have pdshift and shift value same, we don't
 		 * use pgt cache for hugepd.
 		 */
-		if (pdshift > shift && IS_ENABLED(CONFIG_PPC_8xx))
-			pgtable_cache_add(PTE_INDEX_SIZE);
-		else if (pdshift > shift)
-			pgtable_cache_add(pdshift - shift);
-		else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) || IS_ENABLED(CONFIG_PPC_8xx))
+		if (pdshift > shift) {
+			if (!IS_ENABLED(CONFIG_PPC_8xx))
+				pgtable_cache_add(pdshift - shift);
+		} else if (IS_ENABLED(CONFIG_PPC_FSL_BOOK3E) ||
+			   IS_ENABLED(CONFIG_PPC_8xx)) {
 			pgtable_cache_add(PTE_T_ORDER);
+		}
 
 		configured = true;
 	}

From 50a175dd18de7a647e72aca7daf4744e3a5a81e3 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Sun, 9 Feb 2020 16:02:41 +0000
Subject: [PATCH 029/239] powerpc/hugetlb: Fix 8M hugepages on 8xx

With HW assistance all page tables must be 4k aligned, the 8xx drops
the last 12 bits during the walk.

Redefine HUGEPD_SHIFT_MASK to mask last 12 bits out. HUGEPD_SHIFT_MASK
is used to for alignment of page table cache.

Fixes: 22569b881d37 ("powerpc/8xx: Enable 8M hugepage support with HW assistance")
Cc: stable@vger.kernel.org # v5.0+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/778b1a248c4c7ca79640eeff7740044da6a220a0.1581264115.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/page.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 86332080399a..080a0bf8e54b 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -295,8 +295,13 @@ static inline bool pfn_valid(unsigned long pfn)
 /*
  * Some number of bits at the level of the page table that points to
  * a hugepte are used to encode the size.  This masks those bits.
+ * On 8xx, HW assistance requires 4k alignment for the hugepte.
  */
+#ifdef CONFIG_PPC_8xx
+#define HUGEPD_SHIFT_MASK     0xfff
+#else
 #define HUGEPD_SHIFT_MASK     0x3f
+#endif
 
 #ifndef __ASSEMBLY__
 

From a4031afb9d10d97f4d0285844abbc0ab04245304 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Sun, 9 Feb 2020 18:14:42 +0000
Subject: [PATCH 030/239] powerpc/8xx: Fix clearing of bits 20-23 in ITLB miss

In ITLB miss handled the line supposed to clear bits 20-23 on the L2
ITLB entry is buggy and does indeed nothing, leading to undefined
value which could allow execution when it shouldn't.

Properly do the clearing with the relevant instruction.

Fixes: 74fabcadfd43 ("powerpc/8xx: don't use r12/SPRN_SPRG_SCRATCH2 in TLB Miss handlers")
Cc: stable@vger.kernel.org # v5.0+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Leonardo Bras <leonardo@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/4f70c2778163affce8508a210f65d140e84524b4.1581272050.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/head_8xx.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 9922306ae512..073a651787df 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -256,7 +256,7 @@ InstructionTLBMiss:
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.
 	 */
-	rlwimi	r10, r10, 0, 0x0f00	/* Clear bits 20-23 */
+	rlwinm	r10, r10, 0, ~0x0f00	/* Clear bits 20-23 */
 	rlwimi	r10, r10, 4, 0x0400	/* Copy _PAGE_EXEC into bit 21 */
 	ori	r10, r10, RPN_PATTERN | 0x200 /* Set 22 and 24-27 */
 	mtspr	SPRN_MI_RPN, r10	/* Update TLB entry */

From d970a325561da5e611596cbb06475db3755ce823 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 13 Feb 2020 18:22:55 +0100
Subject: [PATCH 031/239] KVM: x86: fix missing prototypes

Reported with "make W=1" due to -Wmissing-prototypes.

Reported-by: Qian Cai <cai@lca.pw>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e89eb67356cb..7944ad6ac10b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -889,6 +889,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
+int kvm_arch_post_init_vm(struct kvm *kvm);
+void kvm_arch_pre_destroy_vm(struct kvm *kvm);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 /*

From 463bfeeead97416ad2b141421f51888054dc0e18 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Fri, 14 Feb 2020 10:44:05 +0800
Subject: [PATCH 032/239] KVM: nVMX: Fix some obsolete comments and grammar
 error

Fix wrong variable names and grammar error in comment.

Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3589cd3c0fcc..a5757b0b80f9 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3161,10 +3161,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
  *
  * Returns:
- *	NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
- *	NVMX_ENTRY_VMFAIL:  Consistency check VMFail
- *	NVMX_ENTRY_VMEXIT:  Consistency check VMExit
- *	NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
+ *	NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
+ *	NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
+ *	NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
+ *	NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
  */
 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 							bool from_vmentry)
@@ -5330,7 +5330,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 }
 
 /*
- * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access,
  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
  * disinterest in the current event (read or write a specific MSR) by using an
  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.

From 8b101a5e14f2161869636ff9cb4907b7749dc0c2 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Fri, 24 Jan 2020 08:48:55 +0300
Subject: [PATCH 033/239] s390/cio: cio_ignore_proc_seq_next should increase
 position index

if seq_file .next fuction does not change position index,
read after some lseek can generate unexpected output.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=206283
Link: https://lore.kernel.org/r/d44c53a7-9bc1-15c7-6d4a-0c10cb9dffce@virtuozzo.com
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 drivers/s390/cio/blacklist.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c
index da642e811f7f..4dd2eb634856 100644
--- a/drivers/s390/cio/blacklist.c
+++ b/drivers/s390/cio/blacklist.c
@@ -303,8 +303,10 @@ static void *
 cio_ignore_proc_seq_next(struct seq_file *s, void *it, loff_t *offset)
 {
 	struct ccwdev_iter *iter;
+	loff_t p = *offset;
 
-	if (*offset >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1))
+	(*offset)++;
+	if (p >= (__MAX_SUBCHANNEL + 1) * (__MAX_SSID + 1))
 		return NULL;
 	iter = it;
 	if (iter->devno == __MAX_SUBCHANNEL) {
@@ -314,7 +316,6 @@ cio_ignore_proc_seq_next(struct seq_file *s, void *it, loff_t *offset)
 			return NULL;
 	} else
 		iter->devno++;
-	(*offset)++;
 	return iter;
 }
 

From b16c3724dd717a354d0f0257fd647ef5518982aa Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 13 Feb 2020 11:13:13 -0500
Subject: [PATCH 034/239] s390/defconfig: enable
 CONFIG_PROTECTED_VIRTUALIZATION_GUEST

The guest support for protected virtualization is default on most
distributions.

Also refresh defconfig and debug_defconfig.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/configs/debug_defconfig | 28 +++++++++++++---------------
 arch/s390/configs/defconfig       | 11 +++++------
 2 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 2e60c80395ab..0c86ba19fa2b 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -53,6 +53,7 @@ CONFIG_VFIO_AP=m
 CONFIG_CRASH_DUMP=y
 CONFIG_HIBERNATION=y
 CONFIG_PM_DEBUG=y
+CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
 CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
@@ -474,7 +475,6 @@ CONFIG_NLMON=m
 # CONFIG_NET_VENDOR_EMULEX is not set
 # CONFIG_NET_VENDOR_EZCHIP is not set
 # CONFIG_NET_VENDOR_GOOGLE is not set
-# CONFIG_NET_VENDOR_HP is not set
 # CONFIG_NET_VENDOR_HUAWEI is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -684,7 +684,6 @@ CONFIG_CRYPTO_ADIANTUM=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
 CONFIG_CRYPTO_CRC32=m
-CONFIG_CRYPTO_XXHASH=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
@@ -748,7 +747,6 @@ CONFIG_DEBUG_INFO_DWARF4=y
 CONFIG_GDB_SCRIPTS=y
 CONFIG_FRAME_WARN=1024
 CONFIG_HEADERS_INSTALL=y
-CONFIG_HEADERS_CHECK=y
 CONFIG_DEBUG_SECTION_MISMATCH=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_PAGEALLOC=y
@@ -772,9 +770,9 @@ CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
 CONFIG_DEBUG_PER_CPU_MAPS=y
 CONFIG_DEBUG_SHIRQ=y
+CONFIG_PANIC_ON_OOPS=y
 CONFIG_DETECT_HUNG_TASK=y
 CONFIG_WQ_WATCHDOG=y
-CONFIG_PANIC_ON_OOPS=y
 CONFIG_DEBUG_TIMEKEEPING=y
 CONFIG_PROVE_LOCKING=y
 CONFIG_LOCK_STAT=y
@@ -783,9 +781,20 @@ CONFIG_DEBUG_ATOMIC_SLEEP=y
 CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
 CONFIG_DEBUG_SG=y
 CONFIG_DEBUG_NOTIFIERS=y
+CONFIG_BUG_ON_DATA_CORRUPTION=y
 CONFIG_DEBUG_CREDENTIALS=y
 CONFIG_RCU_TORTURE_TEST=m
 CONFIG_RCU_CPU_STALL_TIMEOUT=300
+CONFIG_LATENCYTOP=y
+CONFIG_FUNCTION_PROFILER=y
+CONFIG_STACK_TRACER=y
+CONFIG_IRQSOFF_TRACER=y
+CONFIG_PREEMPT_TRACER=y
+CONFIG_SCHED_TRACER=y
+CONFIG_FTRACE_SYSCALLS=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_HIST_TRIGGERS=y
+CONFIG_S390_PTDUMP=y
 CONFIG_NOTIFIER_ERROR_INJECTION=m
 CONFIG_NETDEV_NOTIFIER_ERROR_INJECT=m
 CONFIG_FAULT_INJECTION=y
@@ -796,15 +805,6 @@ CONFIG_FAIL_IO_TIMEOUT=y
 CONFIG_FAIL_FUTEX=y
 CONFIG_FAULT_INJECTION_DEBUG_FS=y
 CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y
-CONFIG_LATENCYTOP=y
-CONFIG_IRQSOFF_TRACER=y
-CONFIG_PREEMPT_TRACER=y
-CONFIG_SCHED_TRACER=y
-CONFIG_FTRACE_SYSCALLS=y
-CONFIG_STACK_TRACER=y
-CONFIG_BLK_DEV_IO_TRACE=y
-CONFIG_FUNCTION_PROFILER=y
-CONFIG_HIST_TRIGGERS=y
 CONFIG_LKDTM=m
 CONFIG_TEST_LIST_SORT=y
 CONFIG_TEST_SORT=y
@@ -814,5 +814,3 @@ CONFIG_INTERVAL_TREE_TEST=m
 CONFIG_PERCPU_TEST=m
 CONFIG_ATOMIC64_SELFTEST=y
 CONFIG_TEST_BPF=m
-CONFIG_BUG_ON_DATA_CORRUPTION=y
-CONFIG_S390_PTDUMP=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 25f799849582..6b27d861a9a3 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -53,6 +53,7 @@ CONFIG_VFIO_AP=m
 CONFIG_CRASH_DUMP=y
 CONFIG_HIBERNATION=y
 CONFIG_PM_DEBUG=y
+CONFIG_PROTECTED_VIRTUALIZATION_GUEST=y
 CONFIG_CMM=m
 CONFIG_APPLDATA_BASE=y
 CONFIG_KVM=m
@@ -470,7 +471,6 @@ CONFIG_NLMON=m
 # CONFIG_NET_VENDOR_EMULEX is not set
 # CONFIG_NET_VENDOR_EZCHIP is not set
 # CONFIG_NET_VENDOR_GOOGLE is not set
-# CONFIG_NET_VENDOR_HP is not set
 # CONFIG_NET_VENDOR_HUAWEI is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -677,7 +677,6 @@ CONFIG_CRYPTO_ADIANTUM=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
 CONFIG_CRYPTO_CRC32=m
-CONFIG_CRYPTO_XXHASH=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
@@ -739,18 +738,18 @@ CONFIG_DEBUG_SECTION_MISMATCH=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_MEMORY_INIT=y
 CONFIG_PANIC_ON_OOPS=y
+CONFIG_BUG_ON_DATA_CORRUPTION=y
 CONFIG_RCU_TORTURE_TEST=m
 CONFIG_RCU_CPU_STALL_TIMEOUT=60
 CONFIG_LATENCYTOP=y
+CONFIG_FUNCTION_PROFILER=y
+CONFIG_STACK_TRACER=y
 CONFIG_SCHED_TRACER=y
 CONFIG_FTRACE_SYSCALLS=y
-CONFIG_STACK_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
-CONFIG_FUNCTION_PROFILER=y
 CONFIG_HIST_TRIGGERS=y
+CONFIG_S390_PTDUMP=y
 CONFIG_LKDTM=m
 CONFIG_PERCPU_TEST=m
 CONFIG_ATOMIC64_SELFTEST=y
 CONFIG_TEST_BPF=m
-CONFIG_BUG_ON_DATA_CORRUPTION=y
-CONFIG_S390_PTDUMP=y

From 0d730b57b95f7e703bb5789de91a7ece6d3a68ac Mon Sep 17 00:00:00 2001
From: chenqiwu <chenqiwu@xiaomi.com>
Date: Fri, 14 Feb 2020 21:51:33 +0800
Subject: [PATCH 035/239] s390/cio: use kobj_to_dev() API

Use kobj_to_dev() API instead of container_of().

Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: chenqiwu <chenqiwu@xiaomi.com>
Signed-off-by: chenqiwu <qiwuchen55@gmail.com>
Message-Id: <1581688293-17283-1-git-send-email-qiwuchen55@gmail.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 drivers/s390/cio/chp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index 51038ec309c1..dfcbe54591fb 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -135,7 +135,7 @@ static ssize_t chp_measurement_chars_read(struct file *filp,
 	struct channel_path *chp;
 	struct device *device;
 
-	device = container_of(kobj, struct device, kobj);
+	device = kobj_to_dev(kobj);
 	chp = to_channelpath(device);
 	if (chp->cmg == -1)
 		return 0;
@@ -184,7 +184,7 @@ static ssize_t chp_measurement_read(struct file *filp, struct kobject *kobj,
 	struct device *device;
 	unsigned int size;
 
-	device = container_of(kobj, struct device, kobj);
+	device = kobj_to_dev(kobj);
 	chp = to_channelpath(device);
 	css = to_css(chp->dev.parent);
 

From 05ccaca003e4f26aceb7f075c073a49159af6e9e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 12 Feb 2020 09:46:22 +0100
Subject: [PATCH 036/239] s390/pkey/zcrypt: spelling s/crytp/crypt/

Fix typos in a comments.

Link: https://lkml.kernel.org/r/20200212084622.9219-1-geert+renesas@glider.be
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 drivers/s390/crypto/zcrypt_ep11misc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/crypto/zcrypt_ep11misc.c b/drivers/s390/crypto/zcrypt_ep11misc.c
index d4caf46ff9df..2afe2153b34e 100644
--- a/drivers/s390/crypto/zcrypt_ep11misc.c
+++ b/drivers/s390/crypto/zcrypt_ep11misc.c
@@ -887,7 +887,7 @@ static int ep11_unwrapkey(u16 card, u16 domain,
 	/* empty pin tag */
 	*p++ = 0x04;
 	*p++ = 0;
-	/* encrytped key value tag and bytes */
+	/* encrypted key value tag and bytes */
 	p += asn1tag_write(p, 0x04, enckey, enckeysize);
 
 	/* reply cprb and payload */
@@ -1095,7 +1095,7 @@ int ep11_clr2keyblob(u16 card, u16 domain, u32 keybitsize, u32 keygenflags,
 
 	/* Step 1: generate AES 256 bit random kek key */
 	rc = ep11_genaeskey(card, domain, 256,
-			    0x00006c00, /* EN/DECRYTP, WRAP/UNWRAP */
+			    0x00006c00, /* EN/DECRYPT, WRAP/UNWRAP */
 			    kek, &keklen);
 	if (rc) {
 		DEBUG_ERR(

From 380324734956c64cd060e1db4304f3117ac15809 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Thu, 13 Feb 2020 23:42:07 -0700
Subject: [PATCH 037/239] s390/mm: Explicitly compare PAGE_DEFAULT_KEY against
 zero in storage_key_init_range

Clang warns:

 In file included from ../arch/s390/purgatory/purgatory.c:10:
 In file included from ../include/linux/kexec.h:18:
 In file included from ../include/linux/crash_core.h:6:
 In file included from ../include/linux/elfcore.h:5:
 In file included from ../include/linux/user.h:1:
 In file included from ../arch/s390/include/asm/user.h:11:
 ../arch/s390/include/asm/page.h:45:6: warning: converting the result of
 '<<' to a boolean always evaluates to false
 [-Wtautological-constant-compare]
         if (PAGE_DEFAULT_KEY)
            ^
 ../arch/s390/include/asm/page.h:23:44: note: expanded from macro
 'PAGE_DEFAULT_KEY'
 #define PAGE_DEFAULT_KEY        (PAGE_DEFAULT_ACC << 4)
                                                  ^
 1 warning generated.

Explicitly compare this against zero to silence the warning as it is
intended to be used in a boolean context.

Fixes: de3fa841e429 ("s390/mm: fix compile for PAGE_DEFAULT_KEY != 0")
Link: https://github.com/ClangBuiltLinux/linux/issues/860
Link: https://lkml.kernel.org/r/20200214064207.10381-1-natechancellor@gmail.com
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/page.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 85e944f04c70..1019efd85b9d 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -42,7 +42,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end);
 
 static inline void storage_key_init_range(unsigned long start, unsigned long end)
 {
-	if (PAGE_DEFAULT_KEY)
+	if (PAGE_DEFAULT_KEY != 0)
 		__storage_key_init_range(start, end);
 }
 

From 788d671517b5c81efbed9310ccbadb8cca86a08e Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Sat, 8 Feb 2020 07:10:52 -0700
Subject: [PATCH 038/239] s390/kaslr: Fix casts in get_random

Clang warns:

../arch/s390/boot/kaslr.c:78:25: warning: passing 'char *' to parameter
of type 'const u8 *' (aka 'const unsigned char *') converts between
pointers to integer
types with different sign [-Wpointer-sign]
                                  (char *) entropy, (char *) entropy,
                                                    ^~~~~~~~~~~~~~~~
../arch/s390/include/asm/cpacf.h:280:28: note: passing argument to
parameter 'src' here
                            u8 *dest, const u8 *src, long src_len)
                                                ^
2 warnings generated.

Fix the cast to match what else is done in this function.

Fixes: b2d24b97b2a9 ("s390/kernel: add support for kernel address space layout randomization (KASLR)")
Link: https://github.com/ClangBuiltLinux/linux/issues/862
Link: https://lkml.kernel.org/r/20200208141052.48476-1-natechancellor@gmail.com
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/boot/kaslr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c
index 5d12352545c5..5591243d673e 100644
--- a/arch/s390/boot/kaslr.c
+++ b/arch/s390/boot/kaslr.c
@@ -75,7 +75,7 @@ static unsigned long get_random(unsigned long limit)
 		*(unsigned long *) prng.parm_block ^= seed;
 		for (i = 0; i < 16; i++) {
 			cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block,
-				  (char *) entropy, (char *) entropy,
+				  (u8 *) entropy, (u8 *) entropy,
 				  sizeof(entropy));
 			memcpy(prng.parm_block, entropy, sizeof(entropy));
 		}

From 94e90f727f7424d827256023cace829cad6896f4 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 16 Feb 2020 23:48:29 +0900
Subject: [PATCH 039/239] s390: make 'install' not depend on vmlinux

For the same reason as commit 19514fc665ff ("arm, kbuild: make "make
install" not depend on vmlinux"), the install targets should never
trigger the rebuild of the kernel.

The variable, CONFIGURE, is not set by anyone. Remove it as well.

Link: https://lkml.kernel.org/r/20200216144829.27023-1-masahiroy@kernel.org
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/Makefile      | 2 +-
 arch/s390/boot/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/Makefile b/arch/s390/Makefile
index e0e3a465bbfd..8dfa2cf1f05c 100644
--- a/arch/s390/Makefile
+++ b/arch/s390/Makefile
@@ -146,7 +146,7 @@ all: bzImage
 #KBUILD_IMAGE is necessary for packaging targets like rpm-pkg, deb-pkg...
 KBUILD_IMAGE	:= $(boot)/bzImage
 
-install: vmlinux
+install:
 	$(Q)$(MAKE) $(build)=$(boot) $@
 
 bzImage: vmlinux
diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile
index e2c47d3a1c89..0ff9261c915e 100644
--- a/arch/s390/boot/Makefile
+++ b/arch/s390/boot/Makefile
@@ -70,7 +70,7 @@ $(obj)/compressed/vmlinux: $(obj)/startup.a FORCE
 $(obj)/startup.a: $(OBJECTS) FORCE
 	$(call if_changed,ar)
 
-install: $(CONFIGURE) $(obj)/bzImage
+install:
 	sh -x  $(srctree)/$(obj)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
 	      System.map "$(INSTALL_PATH)"
 

From 0a923a76d615b4f54456ce6c52865bb0d5b5516d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 16 Feb 2020 20:44:09 -0800
Subject: [PATCH 040/239] Documentation/hwmon: fix xdpe12284 Sphinx warnings

Fix Sphinx format warnings by adding a blank line.

Documentation/hwmon/xdpe12284.rst:28: WARNING: Unexpected indentation.
Documentation/hwmon/xdpe12284.rst:29: WARNING: Block quote ends without a blank line; unexpected unindent.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jean Delvare <jdelvare@suse.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: linux-hwmon@vger.kernel.org
Cc: Vadim Pasternak <vadimp@mellanox.com>
Link: https://lore.kernel.org/r/0094c570-dd4c-dc0e-386d-ea1c39b6a582@infradead.org
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/hwmon/xdpe12284.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/hwmon/xdpe12284.rst b/Documentation/hwmon/xdpe12284.rst
index 6b7ae98cc536..67d1f87808e5 100644
--- a/Documentation/hwmon/xdpe12284.rst
+++ b/Documentation/hwmon/xdpe12284.rst
@@ -24,6 +24,7 @@ This driver implements support for Infineon Multi-phase XDPE122 family
 dual loop voltage regulators.
 The family includes XDPE12284 and XDPE12254 devices.
 The devices from this family complaint with:
+
 - Intel VR13 and VR13HC rev 1.3, IMVP8 rev 1.2 and IMPVP9 rev 1.3 DC-DC
   converter specification.
 - Intel SVID rev 1.9. protocol.

From 2464cc4c345699adea52c7aef75707207cb8a2f6 Mon Sep 17 00:00:00 2001
From: Gustavo Luiz Duarte <gustavold@linux.ibm.com>
Date: Tue, 11 Feb 2020 00:38:29 -0300
Subject: [PATCH 041/239] powerpc/tm: Fix clearing MSR[TS] in current when
 reclaiming on signal delivery

After a treclaim, we expect to be in non-transactional state. If we
don't clear the current thread's MSR[TS] before we get preempted, then
tm_recheckpoint_new_task() will recheckpoint and we get rescheduled in
suspended transaction state.

When handling a signal caught in transactional state,
handle_rt_signal64() calls get_tm_stackpointer() that treclaims the
transaction using tm_reclaim_current() but without clearing the
thread's MSR[TS]. This can cause the TM Bad Thing exception below if
later we pagefault and get preempted trying to access the user's
sigframe, using __put_user(). Afterwards, when we are rescheduled back
into do_page_fault() (but now in suspended state since the thread's
MSR[TS] was not cleared), upon executing 'rfid' after completion of
the page fault handling, the exception is raised because a transition
from suspended to non-transactional state is invalid.

  Unexpected TM Bad Thing exception at c00000000000de44 (msr 0x8000000302a03031) tm_scratch=800000010280b033
  Oops: Unrecoverable exception, sig: 6 [#1]
  LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
  CPU: 25 PID: 15547 Comm: a.out Not tainted 5.4.0-rc2 #32
  NIP:  c00000000000de44 LR: c000000000034728 CTR: 0000000000000000
  REGS: c00000003fe7bd70 TRAP: 0700   Not tainted  (5.4.0-rc2)
  MSR:  8000000302a03031 <SF,VEC,VSX,FP,ME,IR,DR,LE,TM[SE]>  CR: 44000884  XER: 00000000
  CFAR: c00000000000dda4 IRQMASK: 0
  PACATMSCRATCH: 800000010280b033
  GPR00: c000000000034728 c000000f65a17c80 c000000001662800 00007fffacf3fd78
  GPR04: 0000000000001000 0000000000001000 0000000000000000 c000000f611f8af0
  GPR08: 0000000000000000 0000000078006001 0000000000000000 000c000000000000
  GPR12: c000000f611f84b0 c00000003ffcb200 0000000000000000 0000000000000000
  GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  GPR20: 0000000000000000 0000000000000000 0000000000000000 c000000f611f8140
  GPR24: 0000000000000000 00007fffacf3fd68 c000000f65a17d90 c000000f611f7800
  GPR28: c000000f65a17e90 c000000f65a17e90 c000000001685e18 00007fffacf3f000
  NIP [c00000000000de44] fast_exception_return+0xf4/0x1b0
  LR [c000000000034728] handle_rt_signal64+0x78/0xc50
  Call Trace:
  [c000000f65a17c80] [c000000000034710] handle_rt_signal64+0x60/0xc50 (unreliable)
  [c000000f65a17d30] [c000000000023640] do_notify_resume+0x330/0x460
  [c000000f65a17e20] [c00000000000dcc4] ret_from_except_lite+0x70/0x74
  Instruction dump:
  7c4ff120 e8410170 7c5a03a6 38400000 f8410060 e8010070 e8410080 e8610088
  60000000 60000000 e8810090 e8210078 <4c000024> 48000000 e8610178 88ed0989
  ---[ end trace 93094aa44b442f87 ]---

The simplified sequence of events that triggers the above exception is:

  ...				# userspace in NON-TRANSACTIONAL state
  tbegin			# userspace in TRANSACTIONAL state
  signal delivery		# kernelspace in SUSPENDED state
  handle_rt_signal64()
    get_tm_stackpointer()
      treclaim			# kernelspace in NON-TRANSACTIONAL state
    __put_user()
      page fault happens. We will never get back here because of the TM Bad Thing exception.

  page fault handling kicks in and we voluntarily preempt ourselves
  do_page_fault()
    __schedule()
      __switch_to(other_task)

  our task is rescheduled and we recheckpoint because the thread's MSR[TS] was not cleared
  __switch_to(our_task)
    switch_to_tm()
      tm_recheckpoint_new_task()
        trechkpt			# kernelspace in SUSPENDED state

  The page fault handling resumes, but now we are in suspended transaction state
  do_page_fault()    completes
  rfid     <----- trying to get back where the page fault happened (we were non-transactional back then)
  TM Bad Thing			# illegal transition from suspended to non-transactional

This patch fixes that issue by clearing the current thread's MSR[TS]
just after treclaim in get_tm_stackpointer() so that we stay in
non-transactional state in case we are preempted. In order to make
treclaim and clearing the thread's MSR[TS] atomic from a preemption
perspective when CONFIG_PREEMPT is set, preempt_disable/enable() is
used. It's also necessary to save the previous value of the thread's
MSR before get_tm_stackpointer() is called so that it can be exposed
to the signal handler later in setup_tm_sigcontexts() to inform the
userspace MSR at the moment of the signal delivery.

Found with tm-signal-context-force-tm kernel selftest.

Fixes: 2b0a576d15e0 ("powerpc: Add new transactional memory state to the signal context")
Cc: stable@vger.kernel.org # v3.9
Signed-off-by: Gustavo Luiz Duarte <gustavold@linux.ibm.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200211033831.11165-1-gustavold@linux.ibm.com
---
 arch/powerpc/kernel/signal.c    | 17 +++++++++++++++--
 arch/powerpc/kernel/signal_32.c | 28 ++++++++++++++--------------
 arch/powerpc/kernel/signal_64.c | 22 ++++++++++------------
 3 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index e6c30cee6abf..d215f9554553 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -200,14 +200,27 @@ unsigned long get_tm_stackpointer(struct task_struct *tsk)
 	 * normal/non-checkpointed stack pointer.
 	 */
 
+	unsigned long ret = tsk->thread.regs->gpr[1];
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	BUG_ON(tsk != current);
 
 	if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) {
+		preempt_disable();
 		tm_reclaim_current(TM_CAUSE_SIGNAL);
 		if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr))
-			return tsk->thread.ckpt_regs.gpr[1];
+			ret = tsk->thread.ckpt_regs.gpr[1];
+
+		/*
+		 * If we treclaim, we must clear the current thread's TM bits
+		 * before re-enabling preemption. Otherwise we might be
+		 * preempted and have the live MSR[TS] changed behind our back
+		 * (tm_recheckpoint_new_task() would recheckpoint). Besides, we
+		 * enter the signal handler in non-transactional state.
+		 */
+		tsk->thread.regs->msr &= ~MSR_TS_MASK;
+		preempt_enable();
 	}
 #endif
-	return tsk->thread.regs->gpr[1];
+	return ret;
 }
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 98600b276f76..1b090a76b444 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -489,19 +489,11 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
  */
 static int save_tm_user_regs(struct pt_regs *regs,
 			     struct mcontext __user *frame,
-			     struct mcontext __user *tm_frame, int sigret)
+			     struct mcontext __user *tm_frame, int sigret,
+			     unsigned long msr)
 {
-	unsigned long msr = regs->msr;
-
 	WARN_ON(tm_suspend_disabled);
 
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state.  This also ensures
-	 * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-
 	/* Save both sets of general registers */
 	if (save_general_regs(&current->thread.ckpt_regs, frame)
 	    || save_general_regs(regs, tm_frame))
@@ -912,6 +904,10 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 	int sigret;
 	unsigned long tramp;
 	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
 
 	BUG_ON(tsk != current);
 
@@ -944,13 +940,13 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	tm_frame = &rt_sf->uc_transact.uc_mcontext;
-	if (MSR_TM_ACTIVE(regs->msr)) {
+	if (MSR_TM_ACTIVE(msr)) {
 		if (__put_user((unsigned long)&rt_sf->uc_transact,
 			       &rt_sf->uc.uc_link) ||
 		    __put_user((unsigned long)tm_frame,
 			       &rt_sf->uc_transact.uc_regs))
 			goto badframe;
-		if (save_tm_user_regs(regs, frame, tm_frame, sigret))
+		if (save_tm_user_regs(regs, frame, tm_frame, sigret, msr))
 			goto badframe;
 	}
 	else
@@ -1369,6 +1365,10 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 	int sigret;
 	unsigned long tramp;
 	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
 
 	BUG_ON(tsk != current);
 
@@ -1402,9 +1402,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	tm_mctx = &frame->mctx_transact;
-	if (MSR_TM_ACTIVE(regs->msr)) {
+	if (MSR_TM_ACTIVE(msr)) {
 		if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
-				      sigret))
+				      sigret, msr))
 			goto badframe;
 	}
 	else
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 117515564ec7..84ed2e77ef9c 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -192,7 +192,8 @@ static long setup_sigcontext(struct sigcontext __user *sc,
 static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 				 struct sigcontext __user *tm_sc,
 				 struct task_struct *tsk,
-				 int signr, sigset_t *set, unsigned long handler)
+				 int signr, sigset_t *set, unsigned long handler,
+				 unsigned long msr)
 {
 	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
 	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
@@ -207,12 +208,11 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 	elf_vrreg_t __user *tm_v_regs = sigcontext_vmx_regs(tm_sc);
 #endif
 	struct pt_regs *regs = tsk->thread.regs;
-	unsigned long msr = tsk->thread.regs->msr;
 	long err = 0;
 
 	BUG_ON(tsk != current);
 
-	BUG_ON(!MSR_TM_ACTIVE(regs->msr));
+	BUG_ON(!MSR_TM_ACTIVE(msr));
 
 	WARN_ON(tm_suspend_disabled);
 
@@ -222,13 +222,6 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 	 */
 	msr |= tsk->thread.ckpt_regs.msr & (MSR_FP | MSR_VEC | MSR_VSX);
 
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state.  This also ensures
-	 * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-
 #ifdef CONFIG_ALTIVEC
 	err |= __put_user(v_regs, &sc->v_regs);
 	err |= __put_user(tm_v_regs, &tm_sc->v_regs);
@@ -824,6 +817,10 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	unsigned long newsp = 0;
 	long err = 0;
 	struct pt_regs *regs = tsk->thread.regs;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	/* Save the thread's msr before get_tm_stackpointer() changes it */
+	unsigned long msr = regs->msr;
+#endif
 
 	BUG_ON(tsk != current);
 
@@ -841,7 +838,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 	err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __save_altstack(&frame->uc.uc_stack, regs->gpr[1]);
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (MSR_TM_ACTIVE(regs->msr)) {
+	if (MSR_TM_ACTIVE(msr)) {
 		/* The ucontext_t passed to userland points to the second
 		 * ucontext_t (for transactional state) with its uc_link ptr.
 		 */
@@ -849,7 +846,8 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
 		err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
 					    &frame->uc_transact.uc_mcontext,
 					    tsk, ksig->sig, NULL,
-					    (unsigned long)ksig->ka.sa.sa_handler);
+					    (unsigned long)ksig->ka.sa.sa_handler,
+					    msr);
 	} else
 #endif
 	{

From 232ca1eecafed8c54491017f0612c33d8c742d74 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Sat, 15 Feb 2020 10:14:25 +0000
Subject: [PATCH 042/239] powerpc/32s: Fix DSI and ISI exceptions for
 CONFIG_VMAP_STACK

hash_page() needs to read page tables from kernel memory. When entire
kernel memory is mapped by BATs, which is normally the case when
CONFIG_STRICT_KERNEL_RWX is not set, it works even if the page hosting
the page table is not referenced in the MMU hash table.

However, if the page where the page table resides is not covered by
a BAT, a DSI fault can be encountered from hash_page(), and it loops
forever. This can happen when CONFIG_STRICT_KERNEL_RWX is selected
and the alignment of the different regions is too small to allow
covering the entire memory with BATs. This also happens when
CONFIG_DEBUG_PAGEALLOC is selected or when booting with 'nobats'
flag.

Also, if the page containing the kernel stack is not present in the
MMU hash table, registers cannot be saved and a recursive DSI fault
is encountered.

To allow hash_page() to properly do its job at all time and load the
MMU hash table whenever needed, it must run with data MMU disabled.
This means it must be called before re-enabling data MMU. To allow
this, registers clobbered by hash_page() and create_hpte() have to
be saved in the thread struct together with SRR0, SSR1, DAR and DSISR.
It is also necessary to ensure that DSI prolog doesn't overwrite
regs saved by prolog of the current running exception. That means:
- DSI can only use SPRN_SPRG_SCRATCH0
- Exceptions must free SPRN_SPRG_SCRATCH0 before writing to the stack.

This also fixes the Oops reported by Erhard when create_hpte() is
called by add_hash_page().

Due to prolog size increase, a few more exceptions had to get split
in two parts.

Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK")
Reported-by: Erhard F. <erhard_f@mailbox.org>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Tested-by: Erhard F. <erhard_f@mailbox.org>
Tested-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=206501
Link: https://lore.kernel.org/r/64a4aa44686e9fd4b01333401367029771d9b231.1581761633.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/processor.h  |   4 +
 arch/powerpc/kernel/asm-offsets.c     |  12 ++
 arch/powerpc/kernel/head_32.S         | 155 +++++++++++++++++++++++++-
 arch/powerpc/kernel/head_32.h         |  21 +++-
 arch/powerpc/mm/book3s32/hash_low.S   |  52 ++++-----
 arch/powerpc/mm/book3s32/mmu.c        |  10 +-
 arch/powerpc/mm/kasan/kasan_init_32.c |   3 +-
 7 files changed, 212 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 8387698bd5b6..eedcbfb9a6ff 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -168,6 +168,10 @@ struct thread_struct {
 	unsigned long	srr1;
 	unsigned long	dar;
 	unsigned long	dsisr;
+#ifdef CONFIG_PPC_BOOK3S_32
+	unsigned long	r0, r3, r4, r5, r6, r8, r9, r11;
+	unsigned long	lr, ctr;
+#endif
 #endif
 	/* Debug Registers */
 	struct debug_reg debug;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index c25e562f1cd9..fcf24a365fc0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -132,6 +132,18 @@ int main(void)
 	OFFSET(SRR1, thread_struct, srr1);
 	OFFSET(DAR, thread_struct, dar);
 	OFFSET(DSISR, thread_struct, dsisr);
+#ifdef CONFIG_PPC_BOOK3S_32
+	OFFSET(THR0, thread_struct, r0);
+	OFFSET(THR3, thread_struct, r3);
+	OFFSET(THR4, thread_struct, r4);
+	OFFSET(THR5, thread_struct, r5);
+	OFFSET(THR6, thread_struct, r6);
+	OFFSET(THR8, thread_struct, r8);
+	OFFSET(THR9, thread_struct, r9);
+	OFFSET(THR11, thread_struct, r11);
+	OFFSET(THLR, thread_struct, lr);
+	OFFSET(THCTR, thread_struct, ctr);
+#endif
 #endif
 #ifdef CONFIG_SPE
 	OFFSET(THREAD_EVR0, thread_struct, evr[0]);
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 0493fcac6409..97c887950c3c 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -290,17 +290,55 @@ MachineCheck:
 7:	EXCEPTION_PROLOG_2
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_CHRP
-	bne	cr1,1f
+#ifdef CONFIG_VMAP_STACK
+	mfspr	r4, SPRN_SPRG_THREAD
+	tovirt(r4, r4)
+	lwz	r4, RTAS_SP(r4)
+	cmpwi	cr1, r4, 0
 #endif
-	EXC_XFER_STD(0x200, machine_check_exception)
-#ifdef CONFIG_PPC_CHRP
-1:	b	machine_check_in_rtas
+	beq	cr1, machine_check_tramp
+	b	machine_check_in_rtas
+#else
+	b	machine_check_tramp
 #endif
 
 /* Data access exception. */
 	. = 0x300
 	DO_KVM  0x300
 DataAccess:
+#ifdef CONFIG_VMAP_STACK
+	mtspr	SPRN_SPRG_SCRATCH0,r10
+	mfspr	r10, SPRN_SPRG_THREAD
+BEGIN_MMU_FTR_SECTION
+	stw	r11, THR11(r10)
+	mfspr	r10, SPRN_DSISR
+	mfcr	r11
+#ifdef CONFIG_PPC_KUAP
+	andis.	r10, r10, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH | DSISR_PROTFAULT)@h
+#else
+	andis.	r10, r10, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h
+#endif
+	mfspr	r10, SPRN_SPRG_THREAD
+	beq	hash_page_dsi
+.Lhash_page_dsi_cont:
+	mtcr	r11
+	lwz	r11, THR11(r10)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+	mtspr	SPRN_SPRG_SCRATCH1,r11
+	mfspr	r11, SPRN_DAR
+	stw	r11, DAR(r10)
+	mfspr	r11, SPRN_DSISR
+	stw	r11, DSISR(r10)
+	mfspr	r11, SPRN_SRR0
+	stw	r11, SRR0(r10)
+	mfspr	r11, SPRN_SRR1		/* check whether user or kernel */
+	stw	r11, SRR1(r10)
+	mfcr	r10
+	andi.	r11, r11, MSR_PR
+
+	EXCEPTION_PROLOG_1
+	b	handle_page_fault_tramp_1
+#else	/* CONFIG_VMAP_STACK */
 	EXCEPTION_PROLOG handle_dar_dsisr=1
 	get_and_save_dar_dsisr_on_stack	r4, r5, r11
 BEGIN_MMU_FTR_SECTION
@@ -316,11 +354,32 @@ BEGIN_MMU_FTR_SECTION
 FTR_SECTION_ELSE
 	b	handle_page_fault_tramp_2
 ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE)
+#endif	/* CONFIG_VMAP_STACK */
 
 /* Instruction access exception. */
 	. = 0x400
 	DO_KVM  0x400
 InstructionAccess:
+#ifdef CONFIG_VMAP_STACK
+	mtspr	SPRN_SPRG_SCRATCH0,r10
+	mtspr	SPRN_SPRG_SCRATCH1,r11
+	mfspr	r10, SPRN_SPRG_THREAD
+	mfspr	r11, SPRN_SRR0
+	stw	r11, SRR0(r10)
+	mfspr	r11, SPRN_SRR1		/* check whether user or kernel */
+	stw	r11, SRR1(r10)
+	mfcr	r10
+BEGIN_MMU_FTR_SECTION
+	andis.	r11, r11, SRR1_ISI_NOPT@h	/* no pte found? */
+	bne	hash_page_isi
+.Lhash_page_isi_cont:
+	mfspr	r11, SPRN_SRR1		/* check whether user or kernel */
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+	andi.	r11, r11, MSR_PR
+
+	EXCEPTION_PROLOG_1
+	EXCEPTION_PROLOG_2
+#else	/* CONFIG_VMAP_STACK */
 	EXCEPTION_PROLOG
 	andis.	r0,r9,SRR1_ISI_NOPT@h	/* no pte found? */
 	beq	1f			/* if so, try to put a PTE */
@@ -329,6 +388,7 @@ InstructionAccess:
 BEGIN_MMU_FTR_SECTION
 	bl	hash_page
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+#endif	/* CONFIG_VMAP_STACK */
 1:	mr	r4,r12
 	andis.	r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
 	stw	r4, _DAR(r11)
@@ -344,7 +404,7 @@ Alignment:
 	EXCEPTION_PROLOG handle_dar_dsisr=1
 	save_dar_dsisr_on_stack r4, r5, r11
 	addi	r3,r1,STACK_FRAME_OVERHEAD
-	EXC_XFER_STD(0x600, alignment_exception)
+	b	alignment_exception_tramp
 
 /* Program check exception */
 	EXCEPTION(0x700, ProgramCheck, program_check_exception, EXC_XFER_STD)
@@ -645,15 +705,100 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
 
 	. = 0x3000
 
+machine_check_tramp:
+	EXC_XFER_STD(0x200, machine_check_exception)
+
+alignment_exception_tramp:
+	EXC_XFER_STD(0x600, alignment_exception)
+
 handle_page_fault_tramp_1:
+#ifdef CONFIG_VMAP_STACK
+	EXCEPTION_PROLOG_2 handle_dar_dsisr=1
+#endif
 	lwz	r4, _DAR(r11)
 	lwz	r5, _DSISR(r11)
 	/* fall through */
 handle_page_fault_tramp_2:
 	EXC_XFER_LITE(0x300, handle_page_fault)
 
+#ifdef CONFIG_VMAP_STACK
+.macro save_regs_thread		thread
+	stw	r0, THR0(\thread)
+	stw	r3, THR3(\thread)
+	stw	r4, THR4(\thread)
+	stw	r5, THR5(\thread)
+	stw	r6, THR6(\thread)
+	stw	r8, THR8(\thread)
+	stw	r9, THR9(\thread)
+	mflr	r0
+	stw	r0, THLR(\thread)
+	mfctr	r0
+	stw	r0, THCTR(\thread)
+.endm
+
+.macro restore_regs_thread	thread
+	lwz	r0, THLR(\thread)
+	mtlr	r0
+	lwz	r0, THCTR(\thread)
+	mtctr	r0
+	lwz	r0, THR0(\thread)
+	lwz	r3, THR3(\thread)
+	lwz	r4, THR4(\thread)
+	lwz	r5, THR5(\thread)
+	lwz	r6, THR6(\thread)
+	lwz	r8, THR8(\thread)
+	lwz	r9, THR9(\thread)
+.endm
+
+hash_page_dsi:
+	save_regs_thread	r10
+	mfdsisr	r3
+	mfdar	r4
+	mfsrr0	r5
+	mfsrr1	r9
+	rlwinm	r3, r3, 32 - 15, _PAGE_RW	/* DSISR_STORE -> _PAGE_RW */
+	bl	hash_page
+	mfspr	r10, SPRN_SPRG_THREAD
+	restore_regs_thread r10
+	b	.Lhash_page_dsi_cont
+
+hash_page_isi:
+	mr	r11, r10
+	mfspr	r10, SPRN_SPRG_THREAD
+	save_regs_thread	r10
+	li	r3, 0
+	lwz	r4, SRR0(r10)
+	lwz	r9, SRR1(r10)
+	bl	hash_page
+	mfspr	r10, SPRN_SPRG_THREAD
+	restore_regs_thread r10
+	mr	r10, r11
+	b	.Lhash_page_isi_cont
+
+	.globl fast_hash_page_return
+fast_hash_page_return:
+	andis.	r10, r9, SRR1_ISI_NOPT@h	/* Set on ISI, cleared on DSI */
+	mfspr	r10, SPRN_SPRG_THREAD
+	restore_regs_thread r10
+	bne	1f
+
+	/* DSI */
+	mtcr	r11
+	lwz	r11, THR11(r10)
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	SYNC
+	RFI
+
+1:	/* ISI */
+	mtcr	r11
+	mfspr	r11, SPRN_SPRG_SCRATCH1
+	mfspr	r10, SPRN_SPRG_SCRATCH0
+	SYNC
+	RFI
+
 stack_overflow:
 	vmap_stack_overflow_exception
+#endif
 
 AltiVecUnavailable:
 	EXCEPTION_PROLOG
diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h
index a6a5fbbf8504..9db162f79fe6 100644
--- a/arch/powerpc/kernel/head_32.h
+++ b/arch/powerpc/kernel/head_32.h
@@ -64,11 +64,25 @@
 .endm
 
 .macro EXCEPTION_PROLOG_2 handle_dar_dsisr=0
+#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S)
+BEGIN_MMU_FTR_SECTION
+	mtcr	r10
+FTR_SECTION_ELSE
+	stw	r10, _CCR(r11)
+ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE)
+#else
 	stw	r10,_CCR(r11)		/* save registers */
+#endif
+	mfspr	r10, SPRN_SPRG_SCRATCH0
 	stw	r12,GPR12(r11)
 	stw	r9,GPR9(r11)
-	mfspr	r10,SPRN_SPRG_SCRATCH0
 	stw	r10,GPR10(r11)
+#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S)
+BEGIN_MMU_FTR_SECTION
+	mfcr	r10
+	stw	r10, _CCR(r11)
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+#endif
 	mfspr	r12,SPRN_SPRG_SCRATCH1
 	stw	r12,GPR11(r11)
 	mflr	r10
@@ -83,6 +97,11 @@
 	stw	r10, _DSISR(r11)
 	.endif
 	lwz	r9, SRR1(r12)
+#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_PPC_BOOK3S)
+BEGIN_MMU_FTR_SECTION
+	andi.	r10, r9, MSR_PR
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_HPTE_TABLE)
+#endif
 	lwz	r12, SRR0(r12)
 #else
 	mfspr	r12,SPRN_SRR0
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S
index c11b0a005196..2015c4f96238 100644
--- a/arch/powerpc/mm/book3s32/hash_low.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -25,12 +25,6 @@
 #include <asm/feature-fixups.h>
 #include <asm/code-patching-asm.h>
 
-#ifdef CONFIG_VMAP_STACK
-#define ADDR_OFFSET	0
-#else
-#define ADDR_OFFSET	PAGE_OFFSET
-#endif
-
 #ifdef CONFIG_SMP
 	.section .bss
 	.align	2
@@ -53,8 +47,8 @@ mmu_hash_lock:
 	.text
 _GLOBAL(hash_page)
 #ifdef CONFIG_SMP
-	lis	r8, (mmu_hash_lock - ADDR_OFFSET)@h
-	ori	r8, r8, (mmu_hash_lock - ADDR_OFFSET)@l
+	lis	r8, (mmu_hash_lock - PAGE_OFFSET)@h
+	ori	r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l
 	lis	r0,0x0fff
 	b	10f
 11:	lwz	r6,0(r8)
@@ -72,12 +66,9 @@ _GLOBAL(hash_page)
 	cmplw	0,r4,r0
 	ori	r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */
 	mfspr	r5, SPRN_SPRG_PGDIR	/* phys page-table root */
-#ifdef CONFIG_VMAP_STACK
-	tovirt(r5, r5)
-#endif
 	blt+	112f			/* assume user more likely */
-	lis	r5, (swapper_pg_dir - ADDR_OFFSET)@ha	/* if kernel address, use */
-	addi	r5 ,r5 ,(swapper_pg_dir - ADDR_OFFSET)@l	/* kernel page table */
+	lis	r5, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
+	addi	r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
 	rlwimi	r3,r9,32-12,29,29	/* MSR_PR -> _PAGE_USER */
 112:
 #ifndef CONFIG_PTE_64BIT
@@ -89,9 +80,6 @@ _GLOBAL(hash_page)
 	lwzx	r8,r8,r5		/* Get L1 entry */
 	rlwinm.	r8,r8,0,0,20		/* extract pt base address */
 #endif
-#ifdef CONFIG_VMAP_STACK
-	tovirt(r8, r8)
-#endif
 #ifdef CONFIG_SMP
 	beq-	hash_page_out		/* return if no mapping */
 #else
@@ -143,30 +131,36 @@ retry:
 	bne-	retry			/* retry if someone got there first */
 
 	mfsrin	r3,r4			/* get segment reg for segment */
+#ifndef CONFIG_VMAP_STACK
 	mfctr	r0
 	stw	r0,_CTR(r11)
+#endif
 	bl	create_hpte		/* add the hash table entry */
 
 #ifdef CONFIG_SMP
 	eieio
-	lis	r8, (mmu_hash_lock - ADDR_OFFSET)@ha
+	lis	r8, (mmu_hash_lock - PAGE_OFFSET)@ha
 	li	r0,0
-	stw	r0, (mmu_hash_lock - ADDR_OFFSET)@l(r8)
+	stw	r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
 #endif
 
+#ifdef CONFIG_VMAP_STACK
+	b	fast_hash_page_return
+#else
 	/* Return from the exception */
 	lwz	r5,_CTR(r11)
 	mtctr	r5
 	lwz	r0,GPR0(r11)
 	lwz	r8,GPR8(r11)
 	b	fast_exception_return
+#endif
 
 #ifdef CONFIG_SMP
 hash_page_out:
 	eieio
-	lis	r8, (mmu_hash_lock - ADDR_OFFSET)@ha
+	lis	r8, (mmu_hash_lock - PAGE_OFFSET)@ha
 	li	r0,0
-	stw	r0, (mmu_hash_lock - ADDR_OFFSET)@l(r8)
+	stw	r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8)
 	blr
 #endif /* CONFIG_SMP */
 
@@ -341,7 +335,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
 	patch_site	1f, patch__hash_page_A1
 	patch_site	2f, patch__hash_page_A2
 	/* Get the address of the primary PTE group in the hash table (r3) */
-0:	lis	r0, (Hash_base - ADDR_OFFSET)@h	/* base address of hash table */
+0:	lis	r0, (Hash_base - PAGE_OFFSET)@h	/* base address of hash table */
 1:	rlwimi	r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT    /* VSID -> hash */
 2:	rlwinm	r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */
 	xor	r3,r3,r0		/* make primary hash */
@@ -355,10 +349,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
 	beq+	10f			/* no PTE: go look for an empty slot */
 	tlbie	r4
 
-	lis	r4, (htab_hash_searches - ADDR_OFFSET)@ha
-	lwz	r6, (htab_hash_searches - ADDR_OFFSET)@l(r4)
+	lis	r4, (htab_hash_searches - PAGE_OFFSET)@ha
+	lwz	r6, (htab_hash_searches - PAGE_OFFSET)@l(r4)
 	addi	r6,r6,1			/* count how many searches we do */
-	stw	r6, (htab_hash_searches - ADDR_OFFSET)@l(r4)
+	stw	r6, (htab_hash_searches - PAGE_OFFSET)@l(r4)
 
 	/* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */
 	mtctr	r0
@@ -390,10 +384,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
 	beq+	found_empty
 
 	/* update counter of times that the primary PTEG is full */
-	lis	r4, (primary_pteg_full - ADDR_OFFSET)@ha
-	lwz	r6, (primary_pteg_full - ADDR_OFFSET)@l(r4)
+	lis	r4, (primary_pteg_full - PAGE_OFFSET)@ha
+	lwz	r6, (primary_pteg_full - PAGE_OFFSET)@l(r4)
 	addi	r6,r6,1
-	stw	r6, (primary_pteg_full - ADDR_OFFSET)@l(r4)
+	stw	r6, (primary_pteg_full - PAGE_OFFSET)@l(r4)
 
 	patch_site	0f, patch__hash_page_C
 	/* Search the secondary PTEG for an empty slot */
@@ -427,8 +421,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT)
 	 * lockup here but that shouldn't happen
 	 */
 
-1:	lis	r4, (next_slot - ADDR_OFFSET)@ha	/* get next evict slot */
-	lwz	r6, (next_slot - ADDR_OFFSET)@l(r4)
+1:	lis	r4, (next_slot - PAGE_OFFSET)@ha	/* get next evict slot */
+	lwz	r6, (next_slot - PAGE_OFFSET)@l(r4)
 	addi	r6,r6,HPTE_SIZE			/* search for candidate */
 	andi.	r6,r6,7*HPTE_SIZE
 	stw	r6,next_slot@l(r4)
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 0a1c65a2c565..f888cbb109b9 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -413,7 +413,7 @@ void __init MMU_init_hw(void)
 void __init MMU_init_hw_patch(void)
 {
 	unsigned int hmask = Hash_mask >> (16 - LG_HPTEG_SIZE);
-	unsigned int hash;
+	unsigned int hash = (unsigned int)Hash - PAGE_OFFSET;
 
 	if (ppc_md.progress)
 		ppc_md.progress("hash:patch", 0x345);
@@ -425,11 +425,6 @@ void __init MMU_init_hw_patch(void)
 	/*
 	 * Patch up the instructions in hashtable.S:create_hpte
 	 */
-	if (IS_ENABLED(CONFIG_VMAP_STACK))
-		hash = (unsigned int)Hash;
-	else
-		hash = (unsigned int)Hash - PAGE_OFFSET;
-
 	modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16);
 	modify_instruction_site(&patch__hash_page_A1, 0x7c0, hash_mb << 6);
 	modify_instruction_site(&patch__hash_page_A2, 0x7c0, hash_mb2 << 6);
@@ -439,8 +434,7 @@ void __init MMU_init_hw_patch(void)
 	/*
 	 * Patch up the instructions in hashtable.S:flush_hash_page
 	 */
-	modify_instruction_site(&patch__flush_hash_A0, 0xffff,
-				((unsigned int)Hash - PAGE_OFFSET) >> 16);
+	modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16);
 	modify_instruction_site(&patch__flush_hash_A1, 0x7c0, hash_mb << 6);
 	modify_instruction_site(&patch__flush_hash_A2, 0x7c0, hash_mb2 << 6);
 	modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask);
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 16dd95bd0749..db5664dde5ff 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -185,8 +185,7 @@ u8 __initdata early_hash[256 << 10] __aligned(256 << 10) = {0};
 
 static void __init kasan_early_hash_table(void)
 {
-	unsigned int hash = IS_ENABLED(CONFIG_VMAP_STACK) ? (unsigned int)early_hash :
-							    __pa(early_hash);
+	unsigned int hash = __pa(early_hash);
 
 	modify_instruction_site(&patch__hash_page_A0, 0xffff, hash >> 16);
 	modify_instruction_site(&patch__flush_hash_A0, 0xffff, hash >> 16);

From 5a528eb67908bcf493f3583cb4726fbd8e129605 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 14 Feb 2020 08:39:50 +0000
Subject: [PATCH 043/239] powerpc/chrp: Fix enter_rtas() with CONFIG_VMAP_STACK

With CONFIG_VMAP_STACK, data MMU has to be enabled
to read data on the stack.

Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/d2330584f8c42d3039896e2b56f5d39676dc919c.1581669558.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/entry_32.S | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 0713daa651d9..bc056d906b51 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -1354,12 +1354,17 @@ _GLOBAL(enter_rtas)
 	mtspr	SPRN_SRR0,r8
 	mtspr	SPRN_SRR1,r9
 	RFI
-1:	tophys(r9,r1)
+1:	tophys_novmstack r9, r1
+#ifdef CONFIG_VMAP_STACK
+	li	r0, MSR_KERNEL & ~MSR_IR	/* can take DTLB miss */
+	mtmsr	r0
+	isync
+#endif
 	lwz	r8,INT_FRAME_SIZE+4(r9)	/* get return address */
 	lwz	r9,8(r9)	/* original msr value */
 	addi	r1,r1,INT_FRAME_SIZE
 	li	r0,0
-	tophys(r7, r2)
+	tophys_novmstack r7, r2
 	stw	r0, THREAD + RTAS_SP(r7)
 	mtspr	SPRN_SRR0,r8
 	mtspr	SPRN_SRR1,r9

From 477f3488a94e35380c82a7498d46f10fa5f3edd2 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Fri, 14 Feb 2020 06:53:00 +0000
Subject: [PATCH 044/239] powerpc/6xx: Fix power_save_ppc32_restore() with
 CONFIG_VMAP_STACK

power_save_ppc32_restore() is called during exception entry, before
re-enabling the MMU. It substracts KERNELBASE from the address
of nap_save_msscr0 to access it.

With CONFIG_VMAP_STACK enabled, data MMU translation has already been
re-enabled, so power_save_ppc32_restore() has to access
nap_save_msscr0 by its virtual address.

Reported-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Fixes: cd08f109e262 ("powerpc/32s: Enable CONFIG_VMAP_STACK")
Tested-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/7bce32ccbab3ba3e3e0f27da6961bf6313df97ed.1581663140.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/idle_6xx.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S
index 0ffdd18b9f26..433d97bea1f3 100644
--- a/arch/powerpc/kernel/idle_6xx.S
+++ b/arch/powerpc/kernel/idle_6xx.S
@@ -166,7 +166,11 @@ BEGIN_FTR_SECTION
 	mfspr	r9,SPRN_HID0
 	andis.	r9,r9,HID0_NAP@h
 	beq	1f
+#ifdef CONFIG_VMAP_STACK
+	addis	r9, r11, nap_save_msscr0@ha
+#else
 	addis	r9,r11,(nap_save_msscr0-KERNELBASE)@ha
+#endif
 	lwz	r9,nap_save_msscr0@l(r9)
 	mtspr	SPRN_MSSCR0, r9
 	sync
@@ -174,7 +178,11 @@ BEGIN_FTR_SECTION
 1:
 END_FTR_SECTION_IFSET(CPU_FTR_NAP_DISABLE_L2_PR)
 BEGIN_FTR_SECTION
+#ifdef CONFIG_VMAP_STACK
+	addis	r9, r11, nap_save_hid1@ha
+#else
 	addis	r9,r11,(nap_save_hid1-KERNELBASE)@ha
+#endif
 	lwz	r9,nap_save_hid1@l(r9)
 	mtspr	SPRN_HID1, r9
 END_FTR_SECTION_IFSET(CPU_FTR_DUAL_PLL_750FX)

From 066bc3576e653b615ee3f5230a89d69c8ebeeb71 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Mon, 17 Feb 2020 15:13:43 +1100
Subject: [PATCH 045/239] powerpc/xmon: Fix whitespace handling in getstring()

The ls (lookup symbol) and zr (reboot) commands use xmon's getstring()
helper to read a string argument from the xmon prompt. This function
skips over leading whitespace, but doesn't check if the first
"non-whitespace" character is a newline which causes some odd
behaviour (<enter> indicates a the enter key was pressed):

  0:mon> ls printk<enter>
  printk: c0000000001680c4

  0:mon> ls<enter>
  printk<enter>
  Symbol '
  printk' not found.
  0:mon>

With commit 2d9b332d99b ("powerpc/xmon: Allow passing an argument to
ppc_md.restart()") we have a similar problem with the zr command.
Previously zr took no arguments so "zr<enter> would trigger a reboot.
With that patch applied a second newline needs to be sent in order for
the reboot to occur. Fix this by checking if the leading whitespace
ended on a newline:

  0:mon> ls<enter>
  Symbol '' not found.

Fixes: 2d9b332d99b2 ("powerpc/xmon: Allow passing an argument to ppc_md.restart()")
Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200217041343.2454-1-oohall@gmail.com
---
 arch/powerpc/xmon/xmon.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index e8c84d265602..0ec9640335bb 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -3435,6 +3435,11 @@ getstring(char *s, int size)
 	int c;
 
 	c = skipbl();
+	if (c == '\n') {
+		*s = 0;
+		return;
+	}
+
 	do {
 		if( size > 1 ){
 			*s++ = c;

From 789a2c250340666220fa74bc6c8f58497e3863b3 Mon Sep 17 00:00:00 2001
From: Hanno Zulla <kontakt@hanno.de>
Date: Tue, 18 Feb 2020 12:37:47 +0100
Subject: [PATCH 046/239] HID: hid-bigbenff: fix general protection fault
 caused by double kfree

The struct *bigben was allocated via devm_kzalloc() and then used as a
parameter in input_ff_create_memless(). This caused a double kfree
during removal of the device, since both the managed resource API and
ml_ff_destroy() in drivers/input/ff-memless.c would call kfree() on it.

Signed-off-by: Hanno Zulla <kontakt@hanno.de>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/hid-bigbenff.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/hid/hid-bigbenff.c b/drivers/hid/hid-bigbenff.c
index 3f6abd190df4..f7e85bacb688 100644
--- a/drivers/hid/hid-bigbenff.c
+++ b/drivers/hid/hid-bigbenff.c
@@ -220,10 +220,16 @@ static void bigben_worker(struct work_struct *work)
 static int hid_bigben_play_effect(struct input_dev *dev, void *data,
 			 struct ff_effect *effect)
 {
-	struct bigben_device *bigben = data;
+	struct hid_device *hid = input_get_drvdata(dev);
+	struct bigben_device *bigben = hid_get_drvdata(hid);
 	u8 right_motor_on;
 	u8 left_motor_force;
 
+	if (!bigben) {
+		hid_err(hid, "no device data\n");
+		return 0;
+	}
+
 	if (effect->type != FF_RUMBLE)
 		return 0;
 
@@ -341,7 +347,7 @@ static int bigben_probe(struct hid_device *hid,
 
 	INIT_WORK(&bigben->worker, bigben_worker);
 
-	error = input_ff_create_memless(hidinput->input, bigben,
+	error = input_ff_create_memless(hidinput->input, NULL,
 		hid_bigben_play_effect);
 	if (error)
 		return error;

From 976a54d0f4202cb412a3b1fc7f117e1d97db35f3 Mon Sep 17 00:00:00 2001
From: Hanno Zulla <kontakt@hanno.de>
Date: Tue, 18 Feb 2020 12:38:34 +0100
Subject: [PATCH 047/239] HID: hid-bigbenff: call hid_hw_stop() in case of
 error

It's required to call hid_hw_stop() once hid_hw_start() was called
previously, so error cases need to handle this. Also, hid_hw_close() is
not necessary during removal.

Signed-off-by: Hanno Zulla <kontakt@hanno.de>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/hid-bigbenff.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/hid/hid-bigbenff.c b/drivers/hid/hid-bigbenff.c
index f7e85bacb688..f8c552b64a89 100644
--- a/drivers/hid/hid-bigbenff.c
+++ b/drivers/hid/hid-bigbenff.c
@@ -305,7 +305,6 @@ static void bigben_remove(struct hid_device *hid)
 	struct bigben_device *bigben = hid_get_drvdata(hid);
 
 	cancel_work_sync(&bigben->worker);
-	hid_hw_close(hid);
 	hid_hw_stop(hid);
 }
 
@@ -350,7 +349,7 @@ static int bigben_probe(struct hid_device *hid,
 	error = input_ff_create_memless(hidinput->input, NULL,
 		hid_bigben_play_effect);
 	if (error)
-		return error;
+		goto error_hw_stop;
 
 	name_sz = strlen(dev_name(&hid->dev)) + strlen(":red:bigben#") + 1;
 
@@ -360,8 +359,10 @@ static int bigben_probe(struct hid_device *hid,
 			sizeof(struct led_classdev) + name_sz,
 			GFP_KERNEL
 		);
-		if (!led)
-			return -ENOMEM;
+		if (!led) {
+			error = -ENOMEM;
+			goto error_hw_stop;
+		}
 		name = (void *)(&led[1]);
 		snprintf(name, name_sz,
 			"%s:red:bigben%d",
@@ -375,7 +376,7 @@ static int bigben_probe(struct hid_device *hid,
 		bigben->leds[n] = led;
 		error = devm_led_classdev_register(&hid->dev, led);
 		if (error)
-			return error;
+			goto error_hw_stop;
 	}
 
 	/* initial state: LED1 is on, no rumble effect */
@@ -389,6 +390,10 @@ static int bigben_probe(struct hid_device *hid,
 	hid_info(hid, "LED and force feedback support for BigBen gamepad\n");
 
 	return 0;
+
+error_hw_stop:
+	hid_hw_stop(hid);
+	return error;
 }
 
 static __u8 *bigben_report_fixup(struct hid_device *hid, __u8 *rdesc,

From 4eb1b01de5b9d8596d6c103efcf1a15cfc1bedf7 Mon Sep 17 00:00:00 2001
From: Hanno Zulla <kontakt@hanno.de>
Date: Tue, 18 Feb 2020 12:39:31 +0100
Subject: [PATCH 048/239] HID: hid-bigbenff: fix race condition for scheduled
 work during removal

It's possible that there is scheduled work left while the device is
already being removed, which can cause a kernel crash. Adding a flag
will avoid this.

Signed-off-by: Hanno Zulla <kontakt@hanno.de>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
---
 drivers/hid/hid-bigbenff.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/hid/hid-bigbenff.c b/drivers/hid/hid-bigbenff.c
index f8c552b64a89..db6da21ade06 100644
--- a/drivers/hid/hid-bigbenff.c
+++ b/drivers/hid/hid-bigbenff.c
@@ -174,6 +174,7 @@ static __u8 pid0902_rdesc_fixed[] = {
 struct bigben_device {
 	struct hid_device *hid;
 	struct hid_report *report;
+	bool removed;
 	u8 led_state;         /* LED1 = 1 .. LED4 = 8 */
 	u8 right_motor_on;    /* right motor off/on 0/1 */
 	u8 left_motor_force;  /* left motor force 0-255 */
@@ -190,6 +191,9 @@ static void bigben_worker(struct work_struct *work)
 		struct bigben_device, worker);
 	struct hid_field *report_field = bigben->report->field[0];
 
+	if (bigben->removed)
+		return;
+
 	if (bigben->work_led) {
 		bigben->work_led = false;
 		report_field->value[0] = 0x01; /* 1 = led message */
@@ -304,6 +308,7 @@ static void bigben_remove(struct hid_device *hid)
 {
 	struct bigben_device *bigben = hid_get_drvdata(hid);
 
+	bigben->removed = true;
 	cancel_work_sync(&bigben->worker);
 	hid_hw_stop(hid);
 }
@@ -324,6 +329,7 @@ static int bigben_probe(struct hid_device *hid,
 		return -ENOMEM;
 	hid_set_drvdata(hid, bigben);
 	bigben->hid = hid;
+	bigben->removed = false;
 
 	error = hid_parse(hid);
 	if (error) {

From c68a9032299e837b56d356de9250c93094f7e0e3 Mon Sep 17 00:00:00 2001
From: Greentime Hu <greentime.hu@sifive.com>
Date: Thu, 9 Jan 2020 11:17:40 +0800
Subject: [PATCH 049/239] riscv: set pmp configuration if kernel is running in
 M-mode

When the kernel is running in S-mode, the expectation is that the
bootloader or SBI layer will configure the PMP to allow the kernel to
access physical memory.  But, when the kernel is running in M-mode and is
started with the ELF "loader", there's probably no bootloader or SBI layer
involved to configure the PMP.  Thus, we need to configure the PMP
ourselves to enable the kernel to access all regions.

Signed-off-by: Greentime Hu <greentime.hu@sifive.com>
Reviewed-by: Palmer Dabbelt <palmerdabbelt@google.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 arch/riscv/include/asm/csr.h | 12 ++++++++++++
 arch/riscv/kernel/head.S     |  6 ++++++
 2 files changed, 18 insertions(+)

diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index 435b65532e29..8e18d2c64399 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -72,6 +72,16 @@
 #define EXC_LOAD_PAGE_FAULT	13
 #define EXC_STORE_PAGE_FAULT	15
 
+/* PMP configuration */
+#define PMP_R			0x01
+#define PMP_W			0x02
+#define PMP_X			0x04
+#define PMP_A			0x18
+#define PMP_A_TOR		0x08
+#define PMP_A_NA4		0x10
+#define PMP_A_NAPOT		0x18
+#define PMP_L			0x80
+
 /* symbolic CSR names: */
 #define CSR_CYCLE		0xc00
 #define CSR_TIME		0xc01
@@ -100,6 +110,8 @@
 #define CSR_MCAUSE		0x342
 #define CSR_MTVAL		0x343
 #define CSR_MIP			0x344
+#define CSR_PMPCFG0		0x3a0
+#define CSR_PMPADDR0		0x3b0
 #define CSR_MHARTID		0xf14
 
 #ifdef CONFIG_RISCV_M_MODE
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 271860fc2c3f..85f2073e7fe4 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -58,6 +58,12 @@ _start_kernel:
 	/* Reset all registers except ra, a0, a1 */
 	call reset_regs
 
+	/* Setup a PMP to permit access to all of memory. */
+	li a0, -1
+	csrw CSR_PMPADDR0, a0
+	li a0, (PMP_A_NAPOT | PMP_R | PMP_W | PMP_X)
+	csrw CSR_PMPCFG0, a0
+
 	/*
 	 * The hartid in a0 is expected later on, and we have no firmware
 	 * to hand it to us.

From dd1f6308b28edf0452dd5dc7877992903ec61e69 Mon Sep 17 00:00:00 2001
From: Vincenzo Frascino <vincenzo.frascino@arm.com>
Date: Tue, 18 Feb 2020 16:49:06 +0000
Subject: [PATCH 050/239] arm64: lse: Fix LSE atomics with LLVM

Commit e0d5896bd356 ("arm64: lse: fix LSE atomics with LLVM's integrated
assembler") broke the build when clang is used in connjunction with the
binutils assembler ("-no-integrated-as"). This happens because
__LSE_PREAMBLE is defined as ".arch armv8-a+lse", which overrides the
version of the CPU architecture passed via the "-march" paramter to gas:

$ aarch64-none-linux-gnu-as -EL -I ./arch/arm64/include
                                -I ./arch/arm64/include/generated
                                -I ./include -I ./include
                                -I ./arch/arm64/include/uapi
                                -I ./arch/arm64/include/generated/uapi
                                -I ./include/uapi -I ./include/generated/uapi
                                -I ./init -I ./init
                                -march=armv8.3-a -o init/do_mounts.o
                                /tmp/do_mounts-d7992a.s
/tmp/do_mounts-d7992a.s: Assembler messages:
/tmp/do_mounts-d7992a.s:1959: Error: selected processor does not support `autiasp'
/tmp/do_mounts-d7992a.s:2021: Error: selected processor does not support `paciasp'
/tmp/do_mounts-d7992a.s:2157: Error: selected processor does not support `autiasp'
/tmp/do_mounts-d7992a.s:2175: Error: selected processor does not support `paciasp'
/tmp/do_mounts-d7992a.s:2494: Error: selected processor does not support `autiasp'

Fix the issue by replacing ".arch armv8-a+lse" with ".arch_extension lse".
Sami confirms that the clang integrated assembler does now support the
'.arch_extension' directive, so this change will be fine even for LTO
builds in future.

Fixes: e0d5896bd356cd ("arm64: lse: fix LSE atomics with LLVM's integrated assembler")
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Reported-by: Amit Kachhap <Amit.Kachhap@arm.com>
Tested-by: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/lse.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h
index d429f7701c36..5d10051c3e62 100644
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -6,7 +6,7 @@
 
 #ifdef CONFIG_ARM64_LSE_ATOMICS
 
-#define __LSE_PREAMBLE	".arch armv8-a+lse\n"
+#define __LSE_PREAMBLE	".arch_extension lse\n"
 
 #include <linux/compiler_types.h>
 #include <linux/export.h>

From 297a31e3e8318f533cff4fe33ffaefb74f72c6e2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 17 Feb 2020 17:39:45 +0300
Subject: [PATCH 051/239] io_uring: remove unnecessary NULL checks

The "kmsg" pointer can't be NULL and we have already dereferenced it so
a check here would be useless.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 29565d82291f..d35b45696c73 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3075,7 +3075,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
 			if (req->io)
 				return -EAGAIN;
 			if (io_alloc_async_ctx(req)) {
-				if (kmsg && kmsg->iov != kmsg->fast_iov)
+				if (kmsg->iov != kmsg->fast_iov)
 					kfree(kmsg->iov);
 				return -ENOMEM;
 			}
@@ -3229,7 +3229,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
 			if (req->io)
 				return -EAGAIN;
 			if (io_alloc_async_ctx(req)) {
-				if (kmsg && kmsg->iov != kmsg->fast_iov)
+				if (kmsg->iov != kmsg->fast_iov)
 					kfree(kmsg->iov);
 				return -ENOMEM;
 			}

From 6a1ce99dc4bde564e4a072936f9d41f4a439140e Mon Sep 17 00:00:00 2001
From: Anup Patel <anup.patel@wdc.com>
Date: Sun, 2 Feb 2020 16:32:02 +0530
Subject: [PATCH 052/239] RISC-V: Don't enable all interrupts in trap_init()

Historically, we have been enabling all interrupts for each
HART in trap_init(). Ideally, we should only enable M-mode
interrupts for M-mode kernel and S-mode interrupts for S-mode
kernel in trap_init().

Currently, we get suprious S-mode interrupts on Kendryte K210
board running M-mode NO-MMU kernel because we are enabling all
interrupts in trap_init(). To fix this, we only enable software
and external interrupt in trap_init(). In future, trap_init()
will only enable software interrupt and PLIC driver will enable
external interrupt using CPU notifiers.

Fixes: a4c3733d32a7 ("riscv: abstract out CSR names for supervisor vs machine mode")
Signed-off-by: Anup Patel <anup.patel@wdc.com>
Reviewed-by: Atish Patra <atish.patra@wdc.com>
Tested-by: Palmer Dabbelt <palmerdabbelt@google.com> [QMEU virt machine with SMP]
[Palmer: Move the Fixes up to a newer commit]
Reviewed-by: Palmer Dabbelt <palmerdabbelt@google.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 arch/riscv/kernel/traps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index f4cad5163bf2..ffb3d94bf0cc 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -156,6 +156,6 @@ void __init trap_init(void)
 	csr_write(CSR_SCRATCH, 0);
 	/* Set the exception vector address */
 	csr_write(CSR_TVEC, &handle_exception);
-	/* Enable all interrupts */
-	csr_write(CSR_IE, -1);
+	/* Enable interrupts */
+	csr_write(CSR_IE, IE_SIE | IE_EIE);
 }

From bf4498ad3f9a0f7202cf90e52b5ce9bb31700b91 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Mon, 17 Feb 2020 20:04:19 -0800
Subject: [PATCH 053/239] tmpfs: deny and force are not huge mount options

5.6-rc1 commit 2710c957a8ef ("fs_parse: get rid of ->enums") regressed
the huge tmpfs mount options to an earlier state: "deny" and "force"
are not valid there, and can crash the kernel.  Delete those lines.

Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/shmem.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index c8f7540ef048..aad3ba74b0e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3386,8 +3386,6 @@ static const struct constant_table shmem_param_enums_huge[] = {
 	{"always",	SHMEM_HUGE_ALWAYS },
 	{"within_size",	SHMEM_HUGE_WITHIN_SIZE },
 	{"advise",	SHMEM_HUGE_ADVISE },
-	{"deny",	SHMEM_HUGE_DENY },
-	{"force",	SHMEM_HUGE_FORCE },
 	{}
 };
 

From 9eb425b2e04e0e3006adffea5bf5f227a896f128 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 18 Feb 2020 14:09:29 +0000
Subject: [PATCH 054/239] powerpc/entry: Fix an #if which should be an #ifdef
 in entry_32.S

Fixes: 12c3f1fd87bf ("powerpc/32s: get rid of CPU_FTR_601 feature")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/a99fc0ad65b87a1ba51cfa3e0e9034ee294c3e07.1582034961.git.christophe.leroy@c-s.fr
---
 arch/powerpc/kernel/entry_32.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index bc056d906b51..16af0d8d90a8 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -783,7 +783,7 @@ fast_exception_return:
 1:	lis	r3,exc_exit_restart_end@ha
 	addi	r3,r3,exc_exit_restart_end@l
 	cmplw	r12,r3
-#if CONFIG_PPC_BOOK3S_601
+#ifdef CONFIG_PPC_BOOK3S_601
 	bge	2b
 #else
 	bge	3f
@@ -791,7 +791,7 @@ fast_exception_return:
 	lis	r4,exc_exit_restart@ha
 	addi	r4,r4,exc_exit_restart@l
 	cmplw	r12,r4
-#if CONFIG_PPC_BOOK3S_601
+#ifdef CONFIG_PPC_BOOK3S_601
 	blt	2b
 #else
 	blt	3f

From 81f7eb00ff5bb8326e82503a32809421d14abb8a Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 11 Feb 2020 15:25:37 +0800
Subject: [PATCH 055/239] btrfs: destroy qgroup extent records on transaction
 abort

We clean up the delayed references when we abort a transaction but we
leave the pending qgroup extent records behind, leaking memory.

This patch destroys the extent records when we destroy the delayed refs
and makes sure ensure they're gone before releasing the transaction.

Fixes: 3368d001ba5d ("btrfs: qgroup: Record possible quota-related extent for qgroup.")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
[ Rebased to latest upstream, remove to_qgroup() helper, use
  rbtree_postorder_for_each_entry_safe() wrapper ]
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/qgroup.c      | 13 +++++++++++++
 fs/btrfs/qgroup.h      |  1 +
 fs/btrfs/transaction.c |  2 ++
 4 files changed, 17 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 89422aa8e9d1..d7fec89974cb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4276,6 +4276,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
+	btrfs_qgroup_destroy_extent_records(trans);
 
 	spin_unlock(&delayed_refs->lock);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98d9a50352d6..ff1870ff3474 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -4002,3 +4002,16 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
 	}
 	return ret;
 }
+
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
+{
+	struct btrfs_qgroup_extent_record *entry;
+	struct btrfs_qgroup_extent_record *next;
+	struct rb_root *root;
+
+	root = &trans->delayed_refs.dirty_extent_root;
+	rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
+		ulist_free(entry->old_roots);
+		kfree(entry);
+	}
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 236f12224d52..1bc654459469 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -414,5 +414,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
 		u64 last_snapshot);
 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
 		struct btrfs_root *root, struct extent_buffer *eb);
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
 
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 33dcc88b428a..beb6c69cd1e5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -121,6 +121,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 		BUG_ON(!list_empty(&transaction->list));
 		WARN_ON(!RB_EMPTY_ROOT(
 				&transaction->delayed_refs.href_root.rb_root));
+		WARN_ON(!RB_EMPTY_ROOT(
+				&transaction->delayed_refs.dirty_extent_root));
 		if (transaction->delayed_refs.pending_csums)
 			btrfs_err(transaction->fs_info,
 				  "pending csums is %llu",

From 315bf8ef914f31d51d084af950703aa1e09a728c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 13 Feb 2020 10:47:28 -0500
Subject: [PATCH 056/239] btrfs: reset fs_root to NULL on error in open_ctree

While running my error injection script I hit a panic when we tried to
clean up the fs_root when freeing the fs_root.  This is because
fs_info->fs_root == PTR_ERR(-EIO), which isn't great.  Fix this by
setting fs_info->fs_root = NULL; if we fail to read the root.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d7fec89974cb..197352f23534 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3200,6 +3200,7 @@ int __cold open_ctree(struct super_block *sb,
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
 		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+		fs_info->fs_root = NULL;
 		goto fail_qgroup;
 	}
 

From 1e90315149f3fe148e114a5de86f0196d1c21fa5 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 13 Feb 2020 10:47:29 -0500
Subject: [PATCH 057/239] btrfs: do not check delayed items are empty for
 single transaction cleanup

btrfs_assert_delayed_root_empty() will check if the delayed root is
completely empty, but this is a filesystem-wide check.  On cleanup we
may have allowed other transactions to begin, for whatever reason, and
thus the delayed root is not empty.

So remove this check from cleanup_one_transation().  This however can
stay in btrfs_cleanup_transaction(), because it checks only after all of
the transactions have been properly cleaned up, and thus is valid.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 197352f23534..c6c9a6a8e6c8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4503,7 +4503,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 	wake_up(&fs_info->transaction_wait);
 
 	btrfs_destroy_delayed_inodes(fs_info);
-	btrfs_assert_delayed_root_empty(fs_info);
 
 	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
 				     EXTENT_DIRTY);

From bd727173e4432fe6cb70ba108dc1f3602c5409d7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 13 Feb 2020 10:47:30 -0500
Subject: [PATCH 058/239] btrfs: handle logged extent failure properly

If we're allocating a logged extent we attempt to insert an extent
record for the file extent directly.  We increase
space_info->bytes_reserved, because the extent entry addition will call
btrfs_update_block_group(), which will convert the ->bytes_reserved to
->bytes_used.  However if we fail at any point while inserting the
extent entry we will bail and leave space on ->bytes_reserved, which
will trigger a WARN_ON() on umount.  Fix this by pinning the space if we
fail to insert, which is what happens in every other failure case that
involves adding the extent entry.

CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0163fdd59f8f..a7bc66121330 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4430,6 +4430,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 
 	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
 					 offset, ins, 1);
+	if (ret)
+		btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
 	btrfs_put_block_group(block_group);
 	return ret;
 }

From b778cf962d71a0e737923d55d0432f3bd287258e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 13 Feb 2020 10:47:31 -0500
Subject: [PATCH 059/239] btrfs: fix bytes_may_use underflow in prealloc error
 condtition

I hit the following warning while running my error injection stress
testing:

  WARNING: CPU: 3 PID: 1453 at fs/btrfs/space-info.h:108 btrfs_free_reserved_data_space_noquota+0xfd/0x160 [btrfs]
  RIP: 0010:btrfs_free_reserved_data_space_noquota+0xfd/0x160 [btrfs]
  Call Trace:
  btrfs_free_reserved_data_space+0x4f/0x70 [btrfs]
  __btrfs_prealloc_file_range+0x378/0x470 [btrfs]
  elfcorehdr_read+0x40/0x40
  ? elfcorehdr_read+0x40/0x40
  ? btrfs_commit_transaction+0xca/0xa50 [btrfs]
  ? dput+0xb4/0x2a0
  ? btrfs_log_dentry_safe+0x55/0x70 [btrfs]
  ? btrfs_sync_file+0x30e/0x420 [btrfs]
  ? do_fsync+0x38/0x70
  ? __x64_sys_fdatasync+0x13/0x20
  ? do_syscall_64+0x5b/0x1b0
  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9

This happens if we fail to insert our reserved file extent.  At this
point we've already converted our reservation from ->bytes_may_use to
->bytes_reserved.  However once we break we will attempt to free
everything from [cur_offset, end] from ->bytes_may_use, but our extent
reservation will overlap part of this.

Fix this problem by adding ins.offset (our extent allocation size) to
cur_offset so we remove the actual remaining part from ->bytes_may_use.

I validated this fix using my inject-error.py script

python inject-error.py -o should_fail_bio -t cache_save_setup -t \
	__btrfs_prealloc_file_range \
	-t insert_reserved_file_extent.constprop.0 \
	-r "-5" ./run-fsstress.sh

where run-fsstress.sh simply mounts and runs fsstress on a disk.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 36deef69f847..4f47ba652b31 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9824,6 +9824,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
+	u64 clear_offset = start;
 	u64 i_size;
 	u64 cur_bytes;
 	u64 last_alloc = (u64)-1;
@@ -9858,6 +9859,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 				btrfs_end_transaction(trans);
 			break;
 		}
+
+		/*
+		 * We've reserved this space, and thus converted it from
+		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
+		 * from here on out we will only need to clear our reservation
+		 * for the remaining unreserved area, so advance our
+		 * clear_offset by our extent size.
+		 */
+		clear_offset += ins.offset;
 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 
 		last_alloc = ins.offset;
@@ -9937,9 +9947,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 		if (own_trans)
 			btrfs_end_transaction(trans);
 	}
-	if (cur_offset < end)
-		btrfs_free_reserved_data_space(inode, NULL, cur_offset,
-			end - cur_offset + 1);
+	if (clear_offset < end)
+		btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+			end - clear_offset + 1);
 	return ret;
 }
 

From e75fd33b3f744f644061a4f9662bd63f5434f806 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 13 Feb 2020 12:29:50 +0000
Subject: [PATCH 060/239] Btrfs: fix btrfs_wait_ordered_range() so that it
 waits for all ordered extents

In btrfs_wait_ordered_range() once we find an ordered extent that has
finished with an error we exit the loop and don't wait for any other
ordered extents that might be still in progress.

All the users of btrfs_wait_ordered_range() expect that there are no more
ordered extents in progress after that function returns. So past fixes
such like the ones from the two following commits:

  ff612ba7849964 ("btrfs: fix panic during relocation after ENOSPC before
                   writeback happens")

  28aeeac1dd3080 ("Btrfs: fix panic when starting bg cache writeout after
                   IO error")

don't work when there are multiple ordered extents in the range.

Fix that by making btrfs_wait_ordered_range() wait for all ordered extents
even after it finds one that had an error.

Link: https://github.com/kdave/btrfs-progs/issues/228#issuecomment-569777554
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ordered-data.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ecb9fb6a6fe0..a65f189a5b94 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -679,10 +679,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 		}
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		end = ordered->file_offset;
+		/*
+		 * If the ordered extent had an error save the error but don't
+		 * exit without waiting first for all other ordered extents in
+		 * the range to complete.
+		 */
 		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
 			ret = -EIO;
 		btrfs_put_ordered_extent(ordered);
-		if (ret || end == 0 || end == start)
+		if (end == 0 || end == start)
 			break;
 		end--;
 	}

From 929a3af90f0f4bd7132d83552c1a98c83f60ef7e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 19 Feb 2020 00:19:09 +0300
Subject: [PATCH 061/239] io_uring: fix use-after-free by io_cleanup_req()

io_cleanup_req() should be called before req->io is freed, and so
shouldn't be after __io_free_req() -> __io_req_aux_free(). Also,
it will be ignored for in io_free_req_many(), which use
__io_req_aux_free().

Place cleanup_req() into __io_req_aux_free().

Fixes: 99bc4c38537d774 ("io_uring: fix iovec leaks")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index d35b45696c73..6e249aa97ba3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1260,6 +1260,9 @@ static void __io_req_aux_free(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
+	if (req->flags & REQ_F_NEED_CLEANUP)
+		io_cleanup_req(req);
+
 	kfree(req->io);
 	if (req->file) {
 		if (req->flags & REQ_F_FIXED_FILE)
@@ -1275,9 +1278,6 @@ static void __io_free_req(struct io_kiocb *req)
 {
 	__io_req_aux_free(req);
 
-	if (req->flags & REQ_F_NEED_CLEANUP)
-		io_cleanup_req(req);
-
 	if (req->flags & REQ_F_INFLIGHT) {
 		struct io_ring_ctx *ctx = req->ctx;
 		unsigned long flags;

From 140588bfed2727e9a4814211617a79401d74922f Mon Sep 17 00:00:00 2001
From: Stephen Kitt <steve@sk2.org>
Date: Fri, 14 Feb 2020 18:26:28 +0100
Subject: [PATCH 062/239] s390: remove obsolete ieee_emulation_warnings

s390 math emulation was removed with commit 5a79859ae0f3 ("s390:
remove 31 bit support"), rendering ieee_emulation_warnings useless.
The code still built because it was protected by CONFIG_MATHEMU, which
was no longer selectable.

This patch removes the sysctl_ieee_emulation_warnings declaration and
the sysctl entry declaration.

Link: https://lkml.kernel.org/r/20200214172628.3598516-1-steve@sk2.org
Reviewed-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Stephen Kitt <steve@sk2.org>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/processor.h | 1 -
 kernel/sysctl.c                   | 9 ---------
 2 files changed, 10 deletions(-)

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 361ef5eda468..aadb3d0e2adc 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -84,7 +84,6 @@ void s390_update_cpu_mhz(void);
 void cpu_detect_mhz_feature(void);
 
 extern const struct seq_operations cpuinfo_op;
-extern int sysctl_ieee_emulation_warnings;
 extern void execve_tail(void);
 extern void __bpon(void);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d396aaaf19a3..ad5b88a53c5a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -805,15 +805,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &maxolduid,
 	},
 #ifdef CONFIG_S390
-#ifdef CONFIG_MATHEMU
-	{
-		.procname	= "ieee_emulation_warnings",
-		.data		= &sysctl_ieee_emulation_warnings,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 	{
 		.procname	= "userprocess_debug",
 		.data		= &show_unhandled_signals,

From d0022c0ef29b78bcbe8a5c5894bd2307143afce1 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 19 Feb 2020 10:19:13 +0000
Subject: [PATCH 063/239] arm64: memory: Add missing brackets to
 untagged_addr() macro

Add brackets around the evaluation of the 'addr' parameter to the
untagged_addr() macro so that the cast to 'u64' applies to the result
of the expression.

Cc: <stable@vger.kernel.org>
Fixes: 597399d0cb91 ("arm64: tags: Preserve tags for addresses translated via TTBR1")
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/memory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index a4f9ca5479b0..4d94676e5a8b 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -213,7 +213,7 @@ static inline unsigned long kaslr_offset(void)
 	((__force __typeof__(addr))sign_extend64((__force u64)(addr), 55))
 
 #define untagged_addr(addr)	({					\
-	u64 __addr = (__force u64)addr;					\
+	u64 __addr = (__force u64)(addr);					\
 	__addr &= __untagged_addr(__addr);				\
 	(__force __typeof__(addr))__addr;				\
 })

From 1fae37accfc5872af3905d4ba71dc6ab15829be7 Mon Sep 17 00:00:00 2001
From: Shyjumon N <shyjumon.n@intel.com>
Date: Thu, 6 Feb 2020 13:17:25 -0700
Subject: [PATCH 064/239] nvme/pci: Add sleep quirk for Samsung and Toshiba
 drives

The Samsung SSD SM981/PM981 and Toshiba SSD KBG40ZNT256G on the Lenovo
C640 platform experience runtime resume issues when the SSDs are kept in
sleep/suspend mode for long time.

This patch applies the 'Simple Suspend' quirk to these configurations.
With this patch, the issue had not been observed in a 1+ day test.

Reviewed-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Shyjumon N <shyjumon.n@intel.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9c80f9f08149..b0434b687b17 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2747,6 +2747,18 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
 		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
 		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
 			return NVME_QUIRK_NO_APST;
+	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
+		    pdev->device == 0xa808 || pdev->device == 0xa809)) ||
+		   (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
+		/*
+		 * Forcing to use host managed nvme power settings for
+		 * lowest idle power with quick resume latency on
+		 * Samsung and Toshiba SSDs based on suspend behavior
+		 * on Coffee Lake board for LENOVO C640
+		 */
+		if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
+		     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
+			return NVME_QUIRK_SIMPLE_SUSPEND;
 	}
 
 	return 0;

From 98f7b86a0becc1154b1a6df6e75c9695dfd87e0d Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 12 Feb 2020 12:32:18 +0200
Subject: [PATCH 065/239] nvme-pci: Use single IRQ vector for old Apple models

People reported that old Apple machines are not working properly
if the non-first IRQ vector is in use.

Set quirk for that models to limit IRQ to use first vector only.

Based on original patch by GitHub user npx001.

Link: https://github.com/Dunedan/mbp-2016-linux/issues/9
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Leif Liddy <leif.liddy@gmail.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b0434b687b17..ace4dd9e953c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3121,7 +3121,8 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
 				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
 	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
+		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
 		.driver_data = NVME_QUIRK_SINGLE_VECTOR |

From df6d4f9db79c1a5d6f48b59db35ccd1e9ff9adfc Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 16 Jan 2020 12:46:51 -0800
Subject: [PATCH 066/239] x86/boot/compressed: Don't declare __force_order in
 kaslr_64.c

GCC 10 changed the default to -fno-common, which leads to

    LD      arch/x86/boot/compressed/vmlinux
  ld: arch/x86/boot/compressed/pgtable_64.o:(.bss+0x0): multiple definition of `__force_order'; \
    arch/x86/boot/compressed/kaslr_64.o:(.bss+0x0): first defined here
  make[2]: *** [arch/x86/boot/compressed/Makefile:119: arch/x86/boot/compressed/vmlinux] Error 1

Since __force_order is already provided in pgtable_64.c, there is no
need to declare __force_order in kaslr_64.c.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20200124181811.4780-1-hjl.tools@gmail.com
---
 arch/x86/boot/compressed/kaslr_64.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c
index 748456c365f4..9557c5a15b91 100644
--- a/arch/x86/boot/compressed/kaslr_64.c
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -29,9 +29,6 @@
 #define __PAGE_OFFSET __PAGE_OFFSET_BASE
 #include "../../mm/ident_map.c"
 
-/* Used by pgtable.h asm code to force instruction serialization. */
-unsigned long __force_order;
-
 /* Used to track our page table allocation area. */
 struct alloc_pgt_data {
 	unsigned char *pgt_buf;

From e9091ffd6a0aaced111b5d6ead5eaab5cd7101bc Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Mon, 10 Feb 2020 10:48:11 +0100
Subject: [PATCH 067/239] s390/qdio: fill SL with absolute addresses

As the comment says, sl->sbal holds an absolute address. qeth currently
solves this through wild casting, while zfcp doesn't care.

Handle this properly in the code that actually builds the SL.

Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Reviewed-by: Alexandra Winter <wintera@linux.ibm.com>
Reviewed-by: Steffen Maier <maier@linux.ibm.com> [for qdio]
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/qdio.h      |  2 +-
 drivers/s390/cio/qdio_setup.c     |  3 ++-
 drivers/s390/net/qeth_core_main.c | 23 +++++++++++------------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h
index 71e3f0146cda..7870cf834533 100644
--- a/arch/s390/include/asm/qdio.h
+++ b/arch/s390/include/asm/qdio.h
@@ -227,7 +227,7 @@ struct qdio_buffer {
  * @sbal: absolute SBAL address
  */
 struct sl_element {
-	unsigned long sbal;
+	u64 sbal;
 } __attribute__ ((packed));
 
 /**
diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c
index 3ab8e80d7bbc..e115623b86b2 100644
--- a/drivers/s390/cio/qdio_setup.c
+++ b/drivers/s390/cio/qdio_setup.c
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/io.h>
 #include <asm/qdio.h>
 
 #include "cio.h"
@@ -205,7 +206,7 @@ static void setup_storage_lists(struct qdio_q *q, struct qdio_irq *irq_ptr,
 
 	/* fill in sl */
 	for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++)
-		q->sl->element[j].sbal = (unsigned long)q->sbal[j];
+		q->sl->element[j].sbal = virt_to_phys(q->sbal[j]);
 }
 
 static void setup_queues(struct qdio_irq *irq_ptr,
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 9639938581f5..2c83d489f099 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -4746,10 +4746,10 @@ static void qeth_qdio_establish_cq(struct qeth_card *card,
 	if (card->options.cq == QETH_CQ_ENABLED) {
 		int offset = QDIO_MAX_BUFFERS_PER_Q *
 			     (card->qdio.no_in_queues - 1);
-		for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; ++i) {
-			in_sbal_ptrs[offset + i] = (struct qdio_buffer *)
-				virt_to_phys(card->qdio.c_q->bufs[i].buffer);
-		}
+
+		for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++)
+			in_sbal_ptrs[offset + i] =
+				card->qdio.c_q->bufs[i].buffer;
 
 		queue_start_poll[card->qdio.no_in_queues - 1] = NULL;
 	}
@@ -4783,10 +4783,9 @@ static int qeth_qdio_establish(struct qeth_card *card)
 		rc = -ENOMEM;
 		goto out_free_qib_param;
 	}
-	for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; ++i) {
-		in_sbal_ptrs[i] = (struct qdio_buffer *)
-			virt_to_phys(card->qdio.in_q->bufs[i].buffer);
-	}
+
+	for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; i++)
+		in_sbal_ptrs[i] = card->qdio.in_q->bufs[i].buffer;
 
 	queue_start_poll = kcalloc(card->qdio.no_in_queues, sizeof(void *),
 				   GFP_KERNEL);
@@ -4807,11 +4806,11 @@ static int qeth_qdio_establish(struct qeth_card *card)
 		rc = -ENOMEM;
 		goto out_free_queue_start_poll;
 	}
+
 	for (i = 0, k = 0; i < card->qdio.no_out_queues; ++i)
-		for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j, ++k) {
-			out_sbal_ptrs[k] = (struct qdio_buffer *)virt_to_phys(
-				card->qdio.out_qs[i]->bufs[j]->buffer);
-		}
+		for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++, k++)
+			out_sbal_ptrs[k] =
+				card->qdio.out_qs[i]->bufs[j]->buffer;
 
 	memset(&init_data, 0, sizeof(struct qdio_initialize));
 	init_data.cdev                   = CARD_DDEV(card);

From 2db01da8d25f0420c411e788a9e1ba39269ae37b Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Tue, 11 Feb 2020 09:27:38 +0100
Subject: [PATCH 068/239] s390/qdio: fill SBALEs with absolute addresses

sbale->addr holds an absolute address (or for some FCP usage, an opaque
request ID), and should only be used with proper virt/phys translation.

Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Reviewed-by: Benjamin Block <bblock@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
---
 arch/s390/include/asm/qdio.h      |  4 ++--
 drivers/s390/net/qeth_core_main.c | 23 ++++++++++++-----------
 drivers/s390/scsi/zfcp_fsf.c      |  2 +-
 drivers/s390/scsi/zfcp_qdio.c     |  6 +++---
 drivers/s390/scsi/zfcp_qdio.h     |  6 +++---
 5 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h
index 7870cf834533..1e3517b0518b 100644
--- a/arch/s390/include/asm/qdio.h
+++ b/arch/s390/include/asm/qdio.h
@@ -201,7 +201,7 @@ struct slib {
  * @scount: SBAL count
  * @sflags: whole SBAL flags
  * @length: length
- * @addr: address
+ * @addr: absolute data address
 */
 struct qdio_buffer_element {
 	u8 eflags;
@@ -211,7 +211,7 @@ struct qdio_buffer_element {
 	u8 scount;
 	u8 sflags;
 	u32 length;
-	void *addr;
+	u64 addr;
 } __attribute__ ((packed, aligned(16)));
 
 /**
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 2c83d489f099..03935466265c 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -1128,9 +1128,10 @@ static void qeth_clear_output_buffer(struct qeth_qdio_out_q *queue,
 	qeth_tx_complete_buf(buf, error, budget);
 
 	for (i = 0; i < queue->max_elements; ++i) {
-		if (buf->buffer->element[i].addr && buf->is_header[i])
-			kmem_cache_free(qeth_core_header_cache,
-				buf->buffer->element[i].addr);
+		void *data = phys_to_virt(buf->buffer->element[i].addr);
+
+		if (data && buf->is_header[i])
+			kmem_cache_free(qeth_core_header_cache, data);
 		buf->is_header[i] = 0;
 	}
 
@@ -2641,7 +2642,8 @@ static int qeth_init_input_buffer(struct qeth_card *card,
 	buf->pool_entry = pool_entry;
 	for (i = 0; i < QETH_MAX_BUFFER_ELEMENTS(card); ++i) {
 		buf->buffer->element[i].length = PAGE_SIZE;
-		buf->buffer->element[i].addr =  pool_entry->elements[i];
+		buf->buffer->element[i].addr =
+			virt_to_phys(pool_entry->elements[i]);
 		if (i == QETH_MAX_BUFFER_ELEMENTS(card) - 1)
 			buf->buffer->element[i].eflags = SBAL_EFLAGS_LAST_ENTRY;
 		else
@@ -3459,9 +3461,8 @@ static void qeth_qdio_cq_handler(struct qeth_card *card, unsigned int qdio_err,
 
 		while ((e < QDIO_MAX_ELEMENTS_PER_BUFFER) &&
 		       buffer->element[e].addr) {
-			unsigned long phys_aob_addr;
+			unsigned long phys_aob_addr = buffer->element[e].addr;
 
-			phys_aob_addr = (unsigned long) buffer->element[e].addr;
 			qeth_qdio_handle_aob(card, phys_aob_addr);
 			++e;
 		}
@@ -3750,7 +3751,7 @@ static unsigned int __qeth_fill_buffer(struct sk_buff *skb,
 		elem_length = min_t(unsigned int, length,
 				    PAGE_SIZE - offset_in_page(data));
 
-		buffer->element[element].addr = data;
+		buffer->element[element].addr = virt_to_phys(data);
 		buffer->element[element].length = elem_length;
 		length -= elem_length;
 		if (is_first_elem) {
@@ -3780,7 +3781,7 @@ static unsigned int __qeth_fill_buffer(struct sk_buff *skb,
 			elem_length = min_t(unsigned int, length,
 					    PAGE_SIZE - offset_in_page(data));
 
-			buffer->element[element].addr = data;
+			buffer->element[element].addr = virt_to_phys(data);
 			buffer->element[element].length = elem_length;
 			buffer->element[element].eflags =
 				SBAL_EFLAGS_MIDDLE_FRAG;
@@ -3820,7 +3821,7 @@ static unsigned int qeth_fill_buffer(struct qeth_qdio_out_buffer *buf,
 		int element = buf->next_element_to_fill;
 		is_first_elem = false;
 
-		buffer->element[element].addr = hdr;
+		buffer->element[element].addr = virt_to_phys(hdr);
 		buffer->element[element].length = hd_len;
 		buffer->element[element].eflags = SBAL_EFLAGS_FIRST_FRAG;
 		/* remember to free cache-allocated qeth_hdr: */
@@ -5288,7 +5289,7 @@ static int qeth_extract_skb(struct qeth_card *card,
 		offset = 0;
 	}
 
-	hdr = element->addr + offset;
+	hdr = phys_to_virt(element->addr) + offset;
 	offset += sizeof(*hdr);
 	skb = NULL;
 
@@ -5387,7 +5388,7 @@ static int qeth_extract_skb(struct qeth_card *card,
 walk_packet:
 	while (skb_len) {
 		int data_len = min(skb_len, (int)(element->length - offset));
-		char *data = element->addr + offset;
+		char *data = phys_to_virt(element->addr) + offset;
 
 		skb_len -= data_len;
 		offset += data_len;
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index 223a805f0b0b..cae9b7ff79b0 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -2510,7 +2510,7 @@ void zfcp_fsf_reqid_check(struct zfcp_qdio *qdio, int sbal_idx)
 	for (idx = 0; idx < QDIO_MAX_ELEMENTS_PER_BUFFER; idx++) {
 
 		sbale = &sbal->element[idx];
-		req_id = (unsigned long) sbale->addr;
+		req_id = sbale->addr;
 		fsf_req = zfcp_reqlist_find_rm(adapter->req_list, req_id);
 
 		if (!fsf_req) {
diff --git a/drivers/s390/scsi/zfcp_qdio.c b/drivers/s390/scsi/zfcp_qdio.c
index 661436a92f8e..f0d6296e673b 100644
--- a/drivers/s390/scsi/zfcp_qdio.c
+++ b/drivers/s390/scsi/zfcp_qdio.c
@@ -98,7 +98,7 @@ static void zfcp_qdio_int_resp(struct ccw_device *cdev, unsigned int qdio_err,
 			memset(pl, 0,
 			       ZFCP_QDIO_MAX_SBALS_PER_REQ * sizeof(void *));
 			sbale = qdio->res_q[idx]->element;
-			req_id = (u64) sbale->addr;
+			req_id = sbale->addr;
 			scount = min(sbale->scount + 1,
 				     ZFCP_QDIO_MAX_SBALS_PER_REQ + 1);
 				     /* incl. signaling SBAL */
@@ -199,7 +199,7 @@ int zfcp_qdio_sbals_from_sg(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req,
 					     q_req->sbal_number);
 			return -EINVAL;
 		}
-		sbale->addr = sg_virt(sg);
+		sbale->addr = sg_phys(sg);
 		sbale->length = sg->length;
 	}
 	return 0;
@@ -418,7 +418,7 @@ int zfcp_qdio_open(struct zfcp_qdio *qdio)
 		sbale->length = 0;
 		sbale->eflags = SBAL_EFLAGS_LAST_ENTRY;
 		sbale->sflags = 0;
-		sbale->addr = NULL;
+		sbale->addr = 0;
 	}
 
 	if (do_QDIO(cdev, QDIO_FLAG_SYNC_INPUT, 0, 0, QDIO_MAX_BUFFERS_PER_Q))
diff --git a/drivers/s390/scsi/zfcp_qdio.h b/drivers/s390/scsi/zfcp_qdio.h
index 2a816a37b3c0..6b43d6b254be 100644
--- a/drivers/s390/scsi/zfcp_qdio.h
+++ b/drivers/s390/scsi/zfcp_qdio.h
@@ -122,14 +122,14 @@ void zfcp_qdio_req_init(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req,
 					% QDIO_MAX_BUFFERS_PER_Q;
 
 	sbale = zfcp_qdio_sbale_req(qdio, q_req);
-	sbale->addr = (void *) req_id;
+	sbale->addr = req_id;
 	sbale->eflags = 0;
 	sbale->sflags = SBAL_SFLAGS0_COMMAND | sbtype;
 
 	if (unlikely(!data))
 		return;
 	sbale++;
-	sbale->addr = data;
+	sbale->addr = virt_to_phys(data);
 	sbale->length = len;
 }
 
@@ -152,7 +152,7 @@ void zfcp_qdio_fill_next(struct zfcp_qdio *qdio, struct zfcp_qdio_req *q_req,
 	BUG_ON(q_req->sbale_curr == qdio->max_sbale_per_sbal - 1);
 	q_req->sbale_curr++;
 	sbale = zfcp_qdio_sbale_curr(qdio, q_req);
-	sbale->addr = data;
+	sbale->addr = virt_to_phys(data);
 	sbale->length = len;
 }
 

From 15755854d53b4bbb0bb37a0fce66f0156cfc8a17 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Thu, 20 Feb 2020 00:59:36 +0900
Subject: [PATCH 069/239] nvme: Fix uninitialized-variable warning

gcc may detect a false positive on nvme using an unintialized variable
if setting features fails. Since this is not a fast path, explicitly
initialize this variable to suppress the warning.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ada59df642d2..a4d8c90ee7cc 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1165,8 +1165,8 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl,
 static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
 		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
 {
+	union nvme_result res = { 0 };
 	struct nvme_command c;
-	union nvme_result res;
 	int ret;
 
 	memset(&c, 0, sizeof(c));

From 72cf3b3df423c1bbd8fa1056fed009d3a260f8a9 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Mon, 17 Feb 2020 14:11:49 -0700
Subject: [PATCH 070/239] MIPS: vdso: Wrap -mexplicit-relocs in cc-option

Clang does not support this option and errors out:

clang-11: error: unknown argument: '-mexplicit-relocs'

Clang does not appear to need this flag like GCC does because the jalr
check that was added in commit 976c23af3ee5 ("mips: vdso: add build
time check that no 'jalr t9' calls left") passes just fine with

$ make ARCH=mips CC=clang CROSS_COMPILE=mipsel-linux-gnu- malta_defconfig arch/mips/vdso/

even before commit d3f703c4359f ("mips: vdso: fix 'jalr t9' crash in
vdso code").

-mrelax-pic-calls has been supported since clang 9, which is the
earliest version that could build a working MIPS kernel, and it is the
default for clang so just leave it be.

Fixes: d3f703c4359f ("mips: vdso: fix 'jalr t9' crash in vdso code")
Link: https://github.com/ClangBuiltLinux/linux/issues/890
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Paul Burton <paulburton@kernel.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: clang-built-linux@googlegroups.com
---
 arch/mips/vdso/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index 77374c1f0c77..d7fe8408603e 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -33,7 +33,7 @@ endif
 cflags-vdso := $(ccflags-vdso) \
 	$(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \
 	-O3 -g -fPIC -fno-strict-aliasing -fno-common -fno-builtin -G 0 \
-	-mrelax-pic-calls -mexplicit-relocs \
+	-mrelax-pic-calls $(call cc-option, -mexplicit-relocs) \
 	-fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \
 	$(call cc-option, -fno-asynchronous-unwind-tables) \
 	$(call cc-option, -fno-stack-protector)

From eb41113870c94dd7e519c69184efc171b7327699 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E7=90=B0=E6=9D=B0=20=28Zhou=20Yanjie=29?=
 <zhouyanjie@wanyeetech.com>
Date: Sun, 16 Feb 2020 20:10:28 +0800
Subject: [PATCH 071/239] MIPS: X1000: Fix clock of watchdog node.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The devicetree ABI was broken on purpose by commit 6d532143c915
("watchdog: jz4740: Use regmap provided by TCU driver"), and
commit 1d9c30745455 ("watchdog: jz4740: Use WDT clock provided
by TCU driver"). The commit message of the latter explains why the ABI
was broken.

However, the current devicetree files were not updated to the new ABI
described in Documentation/devicetree/bindings/timer/ingenic,tcu.txt,
so the watchdog driver would not probe.

Fix this problem by updating the clock of watchdog node from
"&cgu X1000_CLK_RTCLK" to "&tcu TCU_CLK_WDT" to comply with the new
ABI.

Fixes: 7a16ccd300c2 ("[v8,1/4] MIPS: Ingenic: Add Ingenic X1000 support.").
Signed-off-by: 周琰杰 (Zhou Yanjie) <zhouyanjie@wanyeetech.com>
Signed-off-by: Paul Burton <paulburton@kernel.org>
Cc: linux-mips@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: devicetree@vger.kernel.org
Cc: paul@crapouillou.net
Cc: robh+dt@kernel.org
Cc: mark.rutland@arm.com
Cc: ralf@linux-mips.org
Cc: sernia.zhou@foxmail.com
Cc: zhenwenjin@gmail.com
Cc: dongsheng.qiu@ingenic.com
---
 arch/mips/boot/dts/ingenic/x1000.dtsi | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/mips/boot/dts/ingenic/x1000.dtsi b/arch/mips/boot/dts/ingenic/x1000.dtsi
index 4994c695a1a7..147f7d5c243a 100644
--- a/arch/mips/boot/dts/ingenic/x1000.dtsi
+++ b/arch/mips/boot/dts/ingenic/x1000.dtsi
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <dt-bindings/clock/ingenic,tcu.h>
 #include <dt-bindings/clock/x1000-cgu.h>
 #include <dt-bindings/dma/x1000-dma.h>
 
@@ -72,7 +73,7 @@ wdt: watchdog@0 {
 			compatible = "ingenic,x1000-watchdog", "ingenic,jz4780-watchdog";
 			reg = <0x0 0x10>;
 
-			clocks = <&cgu X1000_CLK_RTCLK>;
+			clocks = <&tcu TCU_CLK_WDT>;
 			clock-names = "wdt";
 		};
 	};
@@ -158,7 +159,6 @@ gpd: gpio@3 {
 	i2c0: i2c-controller@10050000 {
 		compatible = "ingenic,x1000-i2c";
 		reg = <0x10050000 0x1000>;
-
 		#address-cells = <1>;
 		#size-cells = <0>;
 
@@ -173,7 +173,6 @@ i2c0: i2c-controller@10050000 {
 	i2c1: i2c-controller@10051000 {
 		compatible = "ingenic,x1000-i2c";
 		reg = <0x10051000 0x1000>;
-
 		#address-cells = <1>;
 		#size-cells = <0>;
 
@@ -188,7 +187,6 @@ i2c1: i2c-controller@10051000 {
 	i2c2: i2c-controller@10052000 {
 		compatible = "ingenic,x1000-i2c";
 		reg = <0x10052000 0x1000>;
-
 		#address-cells = <1>;
 		#size-cells = <0>;
 

From 11479e8e3cd896673a15af21cd0f145a4752f01a Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Tue, 11 Feb 2020 11:53:37 -0300
Subject: [PATCH 072/239] MIPS: ingenic: DTS: Fix watchdog nodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The devicetree ABI was broken on purpose by commit 6d532143c915
("watchdog: jz4740: Use regmap provided by TCU driver"), and
commit 1d9c30745455 ("watchdog: jz4740: Use WDT clock provided
by TCU driver"). The commit message of the latter explains why the ABI
was broken.

However, the current devicetree files were not updated to the new ABI
described in Documentation/devicetree/bindings/timer/ingenic,tcu.txt,
so the watchdog driver would not probe.

Fix this problem by updating the watchdog nodes to comply with the new
ABI.

Fixes: 6d532143c915 ("watchdog: jz4740: Use regmap provided by TCU driver")
Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Paul Burton <paulburton@kernel.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Zhou Yanjie <zhouyanjie@wanyeetech.com>
Cc: od@zcrc.me
Cc: linux-mips@vger.kernel.org
Cc: devicetree@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: <stable@vger.kernel.org> # v5.5+
---
 arch/mips/boot/dts/ingenic/jz4740.dtsi | 17 +++++++++--------
 arch/mips/boot/dts/ingenic/jz4780.dtsi | 17 +++++++++--------
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/arch/mips/boot/dts/ingenic/jz4740.dtsi b/arch/mips/boot/dts/ingenic/jz4740.dtsi
index 5accda2767be..a3301bab9231 100644
--- a/arch/mips/boot/dts/ingenic/jz4740.dtsi
+++ b/arch/mips/boot/dts/ingenic/jz4740.dtsi
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <dt-bindings/clock/jz4740-cgu.h>
+#include <dt-bindings/clock/ingenic,tcu.h>
 
 / {
 	#address-cells = <1>;
@@ -45,14 +46,6 @@ cgu: jz4740-cgu@10000000 {
 		#clock-cells = <1>;
 	};
 
-	watchdog: watchdog@10002000 {
-		compatible = "ingenic,jz4740-watchdog";
-		reg = <0x10002000 0x10>;
-
-		clocks = <&cgu JZ4740_CLK_RTC>;
-		clock-names = "rtc";
-	};
-
 	tcu: timer@10002000 {
 		compatible = "ingenic,jz4740-tcu", "simple-mfd";
 		reg = <0x10002000 0x1000>;
@@ -73,6 +66,14 @@ &cgu JZ4740_CLK_PCLK
 
 		interrupt-parent = <&intc>;
 		interrupts = <23 22 21>;
+
+		watchdog: watchdog@0 {
+			compatible = "ingenic,jz4740-watchdog";
+			reg = <0x0 0xc>;
+
+			clocks = <&tcu TCU_CLK_WDT>;
+			clock-names = "wdt";
+		};
 	};
 
 	rtc_dev: rtc@10003000 {
diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi b/arch/mips/boot/dts/ingenic/jz4780.dtsi
index f928329b034b..bb89653d16a3 100644
--- a/arch/mips/boot/dts/ingenic/jz4780.dtsi
+++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <dt-bindings/clock/jz4780-cgu.h>
+#include <dt-bindings/clock/ingenic,tcu.h>
 #include <dt-bindings/dma/jz4780-dma.h>
 
 / {
@@ -67,6 +68,14 @@ &cgu JZ4780_CLK_EXCLK
 
 		interrupt-parent = <&intc>;
 		interrupts = <27 26 25>;
+
+		watchdog: watchdog@0 {
+			compatible = "ingenic,jz4780-watchdog";
+			reg = <0x0 0xc>;
+
+			clocks = <&tcu TCU_CLK_WDT>;
+			clock-names = "wdt";
+		};
 	};
 
 	rtc_dev: rtc@10003000 {
@@ -348,14 +357,6 @@ i2c4: i2c@10054000 {
 		status = "disabled";
 	};
 
-	watchdog: watchdog@10002000 {
-		compatible = "ingenic,jz4780-watchdog";
-		reg = <0x10002000 0x10>;
-
-		clocks = <&cgu JZ4780_CLK_RTCLK>;
-		clock-names = "rtc";
-	};
-
 	nemc: nemc@13410000 {
 		compatible = "ingenic,jz4780-nemc";
 		reg = <0x13410000 0x10000>;

From 21b5ee59ef18e27d85810584caf1f7ddc705ea83 Mon Sep 17 00:00:00 2001
From: Kim Phillips <kim.phillips@amd.com>
Date: Wed, 19 Feb 2020 18:52:43 +0100
Subject: [PATCH 073/239] x86/cpu/amd: Enable the fixed Instructions Retired
 counter IRPERF

Commit

  aaf248848db50 ("perf/x86/msr: Add AMD IRPERF (Instructions Retired)
		  performance counter")

added support for access to the free-running counter via 'perf -e
msr/irperf/', but when exercised, it always returns a 0 count:

BEFORE:

  $ perf stat -e instructions,msr/irperf/ true

   Performance counter stats for 'true':

             624,833      instructions
                   0      msr/irperf/

Simply set its enable bit - HWCR bit 30 - to make it start counting.

Enablement is restricted to all machines advertising IRPERF capability,
except those susceptible to an erratum that makes the IRPERF return
bad values.

That erratum occurs in Family 17h models 00-1fh [1], but not in F17h
models 20h and above [2].

AFTER (on a family 17h model 31h machine):

  $ perf stat -e instructions,msr/irperf/ true

   Performance counter stats for 'true':

             621,690      instructions
             622,490      msr/irperf/

[1] Revision Guide for AMD Family 17h Models 00h-0Fh Processors
[2] Revision Guide for AMD Family 17h Models 30h-3Fh Processors

The revision guides are available from the bugzilla Link below.

 [ bp: Massage commit message. ]

Fixes: aaf248848db50 ("perf/x86/msr: Add AMD IRPERF (Instructions Retired) performance counter")
Signed-off-by: Kim Phillips <kim.phillips@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537
Link: http://lkml.kernel.org/r/20200214201805.13830-1-kim.phillips@amd.com
---
 arch/x86/include/asm/msr-index.h |  2 ++
 arch/x86/kernel/cpu/amd.c        | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ebe1685e92dd..d5e517d1c3dd 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -512,6 +512,8 @@
 #define MSR_K7_HWCR			0xc0010015
 #define MSR_K7_HWCR_SMMLOCK_BIT		0
 #define MSR_K7_HWCR_SMMLOCK		BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
+#define MSR_K7_HWCR_IRPERF_EN_BIT	30
+#define MSR_K7_HWCR_IRPERF_EN		BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
 #define MSR_K7_FID_VID_CTL		0xc0010041
 #define MSR_K7_FID_VID_STATUS		0xc0010042
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ac83a0fef628..1f875fbe1384 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -28,6 +28,7 @@
 
 static const int amd_erratum_383[];
 static const int amd_erratum_400[];
+static const int amd_erratum_1054[];
 static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
 
 /*
@@ -972,6 +973,15 @@ static void init_amd(struct cpuinfo_x86 *c)
 	/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
 	if (!cpu_has(c, X86_FEATURE_XENPV))
 		set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+	/*
+	 * Turn on the Instructions Retired free counter on machines not
+	 * susceptible to erratum #1054 "Instructions Retired Performance
+	 * Counter May Be Inaccurate".
+	 */
+	if (cpu_has(c, X86_FEATURE_IRPERF) &&
+	    !cpu_has_amd_erratum(c, amd_erratum_1054))
+		msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
 }
 
 #ifdef CONFIG_X86_32
@@ -1099,6 +1109,10 @@ static const int amd_erratum_400[] =
 static const int amd_erratum_383[] =
 	AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
 
+/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
+static const int amd_erratum_1054[] =
+	AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
+
 
 static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
 {

From dde54b9492a8ba46bcd7e7e26172adf2bfcea817 Mon Sep 17 00:00:00 2001
From: Heidi Fahim <heidifahim@google.com>
Date: Tue, 26 Nov 2019 14:36:16 -0800
Subject: [PATCH 074/239] kunit: test: Improve error messages for kunit_tool
 when kunitconfig is invalid

Previous error message for invalid kunitconfig was vague. Added to it so
that it lists invalid fields and prompts for them to be removed.  Added
validate_config function returning whether or not this kconfig is valid.

Signed-off-by: Heidi Fahim <heidifahim@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Tested-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit_kernel.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/tools/testing/kunit/kunit_kernel.py b/tools/testing/kunit/kunit_kernel.py
index cc5d844ecca1..d99ae75ef72f 100644
--- a/tools/testing/kunit/kunit_kernel.py
+++ b/tools/testing/kunit/kunit_kernel.py
@@ -93,6 +93,20 @@ class LinuxSourceTree(object):
 			return False
 		return True
 
+	def validate_config(self, build_dir):
+		kconfig_path = get_kconfig_path(build_dir)
+		validated_kconfig = kunit_config.Kconfig()
+		validated_kconfig.read_from_file(kconfig_path)
+		if not self._kconfig.is_subset_of(validated_kconfig):
+			invalid = self._kconfig.entries() - validated_kconfig.entries()
+			message = 'Provided Kconfig is not contained in validated .config. Following fields found in kunitconfig, ' \
+					  'but not in .config: %s' % (
+					', '.join([str(e) for e in invalid])
+			)
+			logging.error(message)
+			return False
+		return True
+
 	def build_config(self, build_dir):
 		kconfig_path = get_kconfig_path(build_dir)
 		if build_dir and not os.path.exists(build_dir):
@@ -103,12 +117,7 @@ class LinuxSourceTree(object):
 		except ConfigError as e:
 			logging.error(e)
 			return False
-		validated_kconfig = kunit_config.Kconfig()
-		validated_kconfig.read_from_file(kconfig_path)
-		if not self._kconfig.is_subset_of(validated_kconfig):
-			logging.error('Provided Kconfig is not contained in validated .config!')
-			return False
-		return True
+		return self.validate_config(build_dir)
 
 	def build_reconfig(self, build_dir):
 		"""Creates a new .config if it is not a subset of the .kunitconfig."""
@@ -133,12 +142,7 @@ class LinuxSourceTree(object):
 		except (ConfigError, BuildError) as e:
 			logging.error(e)
 			return False
-		used_kconfig = kunit_config.Kconfig()
-		used_kconfig.read_from_file(get_kconfig_path(build_dir))
-		if not self._kconfig.is_subset_of(used_kconfig):
-			logging.error('Provided Kconfig is not contained in final config!')
-			return False
-		return True
+		return self.validate_config(build_dir)
 
 	def run_kernel(self, args=[], timeout=None, build_dir=''):
 		args.extend(['mem=256M'])

From be886ba90cce2fb2f5a4dbcda8f3be3fd1b2f484 Mon Sep 17 00:00:00 2001
From: Heidi Fahim <heidifahim@google.com>
Date: Tue, 18 Feb 2020 14:19:16 -0800
Subject: [PATCH 075/239] kunit: run kunit_tool from any directory

Implemented small fix so that the script changes work directories to the
root of the linux kernel source tree from which kunit.py is run. This
enables the user to run kunit from any working directory. Originally
considered using os.path.join but this is more error prone as we would
have to find all file path usages and modify them accordingly. Using
os.chdir ensures that the entire script is run within /linux.

Signed-off-by: Heidi Fahim <heidifahim@google.com>
Reviewed-by: Brendan Higgins <brendanhiggins@google.com>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/kunit/kunit.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/testing/kunit/kunit.py b/tools/testing/kunit/kunit.py
index e59eb9e7f923..180ad1e1b04f 100755
--- a/tools/testing/kunit/kunit.py
+++ b/tools/testing/kunit/kunit.py
@@ -24,6 +24,8 @@ KunitResult = namedtuple('KunitResult', ['status','result'])
 
 KunitRequest = namedtuple('KunitRequest', ['raw_output','timeout', 'jobs', 'build_dir', 'defconfig'])
 
+KernelDirectoryPath = sys.argv[0].split('tools/testing/kunit/')[0]
+
 class KunitStatus(Enum):
 	SUCCESS = auto()
 	CONFIG_FAILURE = auto()
@@ -35,6 +37,13 @@ def create_default_kunitconfig():
 		shutil.copyfile('arch/um/configs/kunit_defconfig',
 				kunit_kernel.kunitconfig_path)
 
+def get_kernel_root_path():
+	parts = sys.argv[0] if not __file__ else __file__
+	parts = os.path.realpath(parts).split('tools/testing/kunit')
+	if len(parts) != 2:
+		sys.exit(1)
+	return parts[0]
+
 def run_tests(linux: kunit_kernel.LinuxSourceTree,
 	      request: KunitRequest) -> KunitResult:
 	config_start = time.time()
@@ -114,6 +123,9 @@ def main(argv, linux=None):
 	cli_args = parser.parse_args(argv)
 
 	if cli_args.subcommand == 'run':
+		if get_kernel_root_path():
+			os.chdir(get_kernel_root_path())
+
 		if cli_args.build_dir:
 			if not os.path.exists(cli_args.build_dir):
 				os.mkdir(cli_args.build_dir)

From e7167043ee508739fc9c5030494f94bea48cee23 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Wed, 12 Feb 2020 19:34:23 +0900
Subject: [PATCH 076/239] riscv: Fix gitignore

Tell git to not track the compiled boot/loader and boot/loader.lds
files.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 arch/riscv/boot/.gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/riscv/boot/.gitignore b/arch/riscv/boot/.gitignore
index 8dab0bb6ae66..8a45a37d2af4 100644
--- a/arch/riscv/boot/.gitignore
+++ b/arch/riscv/boot/.gitignore
@@ -1,2 +1,4 @@
 Image
 Image.gz
+loader
+loader.lds

From 88d5271c1efbd89713092b962070cf9af2fcd7c4 Mon Sep 17 00:00:00 2001
From: Tomas Paukrt <tomaspaukrt@email.cz>
Date: Wed, 22 Jan 2020 21:10:39 +0100
Subject: [PATCH 077/239] dt-bindings: mmc: omap-hsmmc: Fix SDIO interrupt

SDIO interrupt must be specified correctly as IRQ_TYPE_LEVEL_LOW instead of GPIO_ACTIVE_LOW.

Signed-off-by: Tomas Paukrt <tomaspaukrt@email.cz>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt b/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
index 19f5508a7569..4a9145ef15d6 100644
--- a/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
+++ b/Documentation/devicetree/bindings/mmc/ti-omap-hsmmc.txt
@@ -124,7 +124,7 @@ not every application needs SDIO irq, e.g. MMC cards.
 		pinctrl-1 = <&mmc1_idle>;
 		pinctrl-2 = <&mmc1_sleep>;
 		...
-		interrupts-extended = <&intc 64 &gpio2 28 GPIO_ACTIVE_LOW>;
+		interrupts-extended = <&intc 64 &gpio2 28 IRQ_TYPE_LEVEL_LOW>;
 	};
 
 	mmc1_idle : pinmux_cirq_pin {

From 3044d9891bdb98ebae5c9e6fb1bd7f007bf7cad6 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Tue, 14 Jan 2020 13:38:21 +0100
Subject: [PATCH 078/239] dt-bindings: memory-controller: Update example for
 Tegra124 EMC

The example in the Tegra124 EMC device tree binding looks like an old
version that doesn't contain all the required fields. Update it with a
version from the current DTS files to fix the make dt_binding_check
target.

Reported-by: Rob Herring <robh+dt@kernel.org>
Signed-off-by: Thierry Reding <treding@nvidia.com>
[robh: also fix missing '#reset-cells']
Signed-off-by: Rob Herring <robh@kernel.org>
---
 .../nvidia,tegra124-emc.yaml                  | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml
index dd1843489ad1..3e0a8a92d652 100644
--- a/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml
+++ b/Documentation/devicetree/bindings/memory-controllers/nvidia,tegra124-emc.yaml
@@ -347,6 +347,7 @@ examples:
         interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>;
 
         #iommu-cells = <1>;
+        #reset-cells = <1>;
     };
 
     external-memory-controller@7001b000 {
@@ -363,20 +364,23 @@ examples:
             timing-0 {
                 clock-frequency = <12750000>;
 
-                nvidia,emc-zcal-cnt-long = <0x00000042>;
-                nvidia,emc-auto-cal-interval = <0x001fffff>;
-                nvidia,emc-ctt-term-ctrl = <0x00000802>;
-                nvidia,emc-cfg = <0x73240000>;
-                nvidia,emc-cfg-2 = <0x000008c5>;
-                nvidia,emc-sel-dpd-ctrl = <0x00040128>;
-                nvidia,emc-bgbias-ctl0 = <0x00000008>;
                 nvidia,emc-auto-cal-config = <0xa1430000>;
                 nvidia,emc-auto-cal-config2 = <0x00000000>;
                 nvidia,emc-auto-cal-config3 = <0x00000000>;
-                nvidia,emc-mode-reset = <0x80001221>;
+                nvidia,emc-auto-cal-interval = <0x001fffff>;
+                nvidia,emc-bgbias-ctl0 = <0x00000008>;
+                nvidia,emc-cfg = <0x73240000>;
+                nvidia,emc-cfg-2 = <0x000008c5>;
+                nvidia,emc-ctt-term-ctrl = <0x00000802>;
                 nvidia,emc-mode-1 = <0x80100003>;
                 nvidia,emc-mode-2 = <0x80200008>;
                 nvidia,emc-mode-4 = <0x00000000>;
+                nvidia,emc-mode-reset = <0x80001221>;
+                nvidia,emc-mrs-wait-cnt = <0x000e000e>;
+                nvidia,emc-sel-dpd-ctrl = <0x00040128>;
+                nvidia,emc-xm2dqspadctrl2 = <0x0130b118>;
+                nvidia,emc-zcal-cnt-long = <0x00000042>;
+                nvidia,emc-zcal-interval = <0x00000000>;
 
                 nvidia,emc-configuration = <
                     0x00000000 /* EMC_RC */

From 867c1859590f3bbc13d9cfb46c90c8202769d0e5 Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Fri, 14 Feb 2020 21:44:08 +0200
Subject: [PATCH 079/239] dt-bindings: net: mdio: remove compatible string from
 example

Remove vendor specific compatible string from example, otherwise DT YAML
schemas validation may trigger warnings specific to TI ti,davinci_mdio
and not to the generic MDIO example.

For example, the "bus_freq" is required for davinci_mdio, but not required for
generic mdio example. As result following warning will be produced:
 mdio.example.dt.yaml: mdio@5c030000: 'bus_freq' is a required property

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 Documentation/devicetree/bindings/net/mdio.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml
index 5d08d2ffd4eb..50c3397a82bc 100644
--- a/Documentation/devicetree/bindings/net/mdio.yaml
+++ b/Documentation/devicetree/bindings/net/mdio.yaml
@@ -56,7 +56,6 @@ patternProperties:
 examples:
   - |
     davinci_mdio: mdio@5c030000 {
-        compatible = "ti,davinci_mdio";
         reg = <0x5c030000 0x1000>;
         #address-cells = <1>;
         #size-cells = <0>;

From 303d37b4b0522150c0c7701d84934bc13199bebc Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime@cerno.tech>
Date: Mon, 10 Feb 2020 11:04:16 +0100
Subject: [PATCH 080/239] dt-bindings: media: csi: Add interconnects properties

The Allwinner CSI controller is sitting beside the MBUS that is represented
as an interconnect.

Make sure that the interconnect properties are valid in the binding.

Fixes: 7866d6903ce8 ("media: dt-bindings: media: sun4i-csi: Add compatible for CSI0 on R40")
Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 .../bindings/media/allwinner,sun4i-a10-csi.yaml        | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml b/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml
index 9af873b43acd..afde17d9dab1 100644
--- a/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml
+++ b/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml
@@ -51,6 +51,16 @@ properties:
   resets:
     maxItems: 1
 
+  # FIXME: This should be made required eventually once every SoC will
+  # have the MBUS declared.
+  interconnects:
+    maxItems: 1
+
+  # FIXME: This should be made required eventually once every SoC will
+  # have the MBUS declared.
+  interconnect-names:
+    const: dma-mem
+
   # See ./video-interfaces.txt for details
   port:
     type: object

From 854bdbae9058bcf09b0add70b6047bc9ca776de2 Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime@cerno.tech>
Date: Mon, 10 Feb 2020 11:04:17 +0100
Subject: [PATCH 081/239] dt-bindings: media: csi: Fix clocks description

Commit 1de243b07666 ("media: dt-bindings: media: sun4i-csi: Add compatible
for CSI1 on A10/A20") introduced support for the CSI1 controller on A10 and
A20 that unlike CSI0 doesn't have an ISP and therefore only have two
clocks, the bus and module clocks.

The clocks and clock-names properties have thus been modified to allow
either two or tree clocks. However, the current list has the ISP clock at
the second position, which means the bindings expects a list of either
bus and isp, or bus, isp and mod. The initial intent of the patch was
obviously to have bus and mod in the former case.

Let's fix the binding so that it validates properly.

Fixes: 1de243b07666 ("media: dt-bindings: media: sun4i-csi: Add compatible for CSI1 on A10/A20")
Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Signed-off-by: Rob Herring <robh@kernel.org>
---
 .../media/allwinner,sun4i-a10-csi.yaml        | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml b/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml
index afde17d9dab1..8453ee340b9f 100644
--- a/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml
+++ b/Documentation/devicetree/bindings/media/allwinner,sun4i-a10-csi.yaml
@@ -33,20 +33,26 @@ properties:
     maxItems: 1
 
   clocks:
-    minItems: 2
-    maxItems: 3
-    items:
-      - description: The CSI interface clock
-      - description: The CSI ISP clock
-      - description: The CSI DRAM clock
+    oneOf:
+      - items:
+        - description: The CSI interface clock
+        - description: The CSI DRAM clock
+
+      - items:
+        - description: The CSI interface clock
+        - description: The CSI ISP clock
+        - description: The CSI DRAM clock
 
   clock-names:
-    minItems: 2
-    maxItems: 3
-    items:
-      - const: bus
-      - const: isp
-      - const: ram
+    oneOf:
+      - items:
+        - const: bus
+        - const: ram
+
+      - items:
+        - const: bus
+        - const: isp
+        - const: ram
 
   resets:
     maxItems: 1

From badcd4546d52ae4318f2bcfda0e47a1394b60e38 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Wed, 19 Feb 2020 14:36:14 -0800
Subject: [PATCH 082/239] hwmon: (acpi_power_meter) Fix lockdep splat

Damien Le Moal reports a lockdep splat with the acpi_power_meter,
observed with Linux v5.5 and later.

======================================================
WARNING: possible circular locking dependency detected
5.6.0-rc2+ #629 Not tainted
------------------------------------------------------
python/1397 is trying to acquire lock:
ffff888619080070 (&resource->lock){+.+.}, at: show_power+0x3c/0xa0 [acpi_power_meter]

               but task is already holding lock:
ffff88881643f188 (kn->count#119){++++}, at: kernfs_seq_start+0x6a/0x160

               which lock already depends on the new lock.

               the existing dependency chain (in reverse order) is:

               -> #1 (kn->count#119){++++}:
       __kernfs_remove+0x626/0x7e0
       kernfs_remove_by_name_ns+0x41/0x80
       remove_attrs+0xcb/0x3c0 [acpi_power_meter]
       acpi_power_meter_notify+0x1f7/0x310 [acpi_power_meter]
       acpi_ev_notify_dispatch+0x198/0x1f3
       acpi_os_execute_deferred+0x4d/0x70
       process_one_work+0x7c8/0x1340
       worker_thread+0x94/0xc70
       kthread+0x2ed/0x3f0
       ret_from_fork+0x24/0x30

               -> #0 (&resource->lock){+.+.}:
       __lock_acquire+0x20be/0x49b0
       lock_acquire+0x127/0x340
       __mutex_lock+0x15b/0x1350
       show_power+0x3c/0xa0 [acpi_power_meter]
       dev_attr_show+0x3f/0x80
       sysfs_kf_seq_show+0x216/0x410
       seq_read+0x407/0xf90
       vfs_read+0x152/0x2c0
       ksys_read+0xf3/0x1d0
       do_syscall_64+0x95/0x1010
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

               other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(kn->count#119);
                               lock(&resource->lock);
                               lock(kn->count#119);
  lock(&resource->lock);

                *** DEADLOCK ***
4 locks held by python/1397:
 #0: ffff8890242d64e0 (&f->f_pos_lock){+.+.}, at: __fdget_pos+0x9b/0xb0
 #1: ffff889040be74e0 (&p->lock){+.+.}, at: seq_read+0x6b/0xf90
 #2: ffff8890448eb880 (&of->mutex){+.+.}, at: kernfs_seq_start+0x47/0x160
 #3: ffff88881643f188 (kn->count#119){++++}, at: kernfs_seq_start+0x6a/0x160

               stack backtrace:
CPU: 10 PID: 1397 Comm: python Not tainted 5.6.0-rc2+ #629
Hardware name: Supermicro Super Server/X11DPL-i, BIOS 3.1 05/21/2019
Call Trace:
 dump_stack+0x97/0xe0
 check_noncircular+0x32e/0x3e0
 ? print_circular_bug.isra.0+0x1e0/0x1e0
 ? unwind_next_frame+0xb9a/0x1890
 ? entry_SYSCALL_64_after_hwframe+0x49/0xbe
 ? graph_lock+0x79/0x170
 ? __lockdep_reset_lock+0x3c0/0x3c0
 ? mark_lock+0xbc/0x1150
 __lock_acquire+0x20be/0x49b0
 ? mark_held_locks+0xe0/0xe0
 ? stack_trace_save+0x91/0xc0
 lock_acquire+0x127/0x340
 ? show_power+0x3c/0xa0 [acpi_power_meter]
 ? device_remove_bin_file+0x10/0x10
 ? device_remove_bin_file+0x10/0x10
 __mutex_lock+0x15b/0x1350
 ? show_power+0x3c/0xa0 [acpi_power_meter]
 ? show_power+0x3c/0xa0 [acpi_power_meter]
 ? mutex_lock_io_nested+0x11f0/0x11f0
 ? lock_downgrade+0x6a0/0x6a0
 ? kernfs_seq_start+0x47/0x160
 ? lock_acquire+0x127/0x340
 ? kernfs_seq_start+0x6a/0x160
 ? device_remove_bin_file+0x10/0x10
 ? show_power+0x3c/0xa0 [acpi_power_meter]
 show_power+0x3c/0xa0 [acpi_power_meter]
 dev_attr_show+0x3f/0x80
 ? memset+0x20/0x40
 sysfs_kf_seq_show+0x216/0x410
 seq_read+0x407/0xf90
 ? security_file_permission+0x16f/0x2c0
 vfs_read+0x152/0x2c0

Problem is that reading an attribute takes the kernfs lock in the kernfs
code, then resource->lock in the driver. During an ACPI notification, the
opposite happens: The resource lock is taken first, followed by the kernfs
lock when sysfs attributes are removed and re-created. Presumably this is
now seen due to some locking related changes in kernfs after v5.4, but it
was likely always a problem.

Fix the problem by not blindly acquiring the lock in the notification
function. It is only needed to protect the various update functions.
However, those update functions are called anyway when sysfs attributes
are read. This means that we can just stop calling those functions from
the notifier, and the resource lock in the notifier function is no longer
needed.

That leaves two situations:

First, METER_NOTIFY_CONFIG removes and re-allocates capability strings.
While it did so under the resource lock, _displaying_ those strings was not
protected, creating a race condition. To solve this problem, selectively
protect both removal/creation and reporting of capability attributes with
the resource lock.

Second, removing and re-creating the attribute files is no longer protected
by the resource lock. That doesn't matter since access to each individual
attribute is protected by the kernfs lock. Userspace may get messed up if
attributes disappear and reappear under its nose, but that is not different
than today, and there is nothing we can do about it without major driver
restructuring.

Last but not least, when removing the driver, remove attribute functions
first, then release capability strings. This avoids yet another race
condition.

Reported-by: Damien Le Moal <Damien.LeMoal@wdc.com>
Cc: Damien Le Moal <Damien.LeMoal@wdc.com>
Cc: stable@vger.kernel.org # v5.5+
Tested-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/acpi_power_meter.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c
index 4cf25458f0b9..0db8ef4fd6e1 100644
--- a/drivers/hwmon/acpi_power_meter.c
+++ b/drivers/hwmon/acpi_power_meter.c
@@ -355,7 +355,9 @@ static ssize_t show_str(struct device *dev,
 	struct acpi_device *acpi_dev = to_acpi_device(dev);
 	struct acpi_power_meter_resource *resource = acpi_dev->driver_data;
 	acpi_string val;
+	int ret;
 
+	mutex_lock(&resource->lock);
 	switch (attr->index) {
 	case 0:
 		val = resource->model_number;
@@ -372,8 +374,9 @@ static ssize_t show_str(struct device *dev,
 		val = "";
 		break;
 	}
-
-	return sprintf(buf, "%s\n", val);
+	ret = sprintf(buf, "%s\n", val);
+	mutex_unlock(&resource->lock);
+	return ret;
 }
 
 static ssize_t show_val(struct device *dev,
@@ -817,11 +820,12 @@ static void acpi_power_meter_notify(struct acpi_device *device, u32 event)
 
 	resource = acpi_driver_data(device);
 
-	mutex_lock(&resource->lock);
 	switch (event) {
 	case METER_NOTIFY_CONFIG:
+		mutex_lock(&resource->lock);
 		free_capabilities(resource);
 		res = read_capabilities(resource);
+		mutex_unlock(&resource->lock);
 		if (res)
 			break;
 
@@ -830,15 +834,12 @@ static void acpi_power_meter_notify(struct acpi_device *device, u32 event)
 		break;
 	case METER_NOTIFY_TRIP:
 		sysfs_notify(&device->dev.kobj, NULL, POWER_AVERAGE_NAME);
-		update_meter(resource);
 		break;
 	case METER_NOTIFY_CAP:
 		sysfs_notify(&device->dev.kobj, NULL, POWER_CAP_NAME);
-		update_cap(resource);
 		break;
 	case METER_NOTIFY_INTERVAL:
 		sysfs_notify(&device->dev.kobj, NULL, POWER_AVG_INTERVAL_NAME);
-		update_avg_interval(resource);
 		break;
 	case METER_NOTIFY_CAPPING:
 		sysfs_notify(&device->dev.kobj, NULL, POWER_ALARM_NAME);
@@ -848,7 +849,6 @@ static void acpi_power_meter_notify(struct acpi_device *device, u32 event)
 		WARN(1, "Unexpected event %d\n", event);
 		break;
 	}
-	mutex_unlock(&resource->lock);
 
 	acpi_bus_generate_netlink_event(ACPI_POWER_METER_CLASS,
 					dev_name(&device->dev), event, 0);
@@ -912,8 +912,8 @@ static int acpi_power_meter_remove(struct acpi_device *device)
 	resource = acpi_driver_data(device);
 	hwmon_device_unregister(resource->hwmon_dev);
 
-	free_capabilities(resource);
 	remove_attrs(resource);
+	free_capabilities(resource);
 
 	kfree(resource);
 	return 0;

From 35df4299a6487f323b0aca120ea3f485dfee2ae3 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Fri, 7 Feb 2020 09:29:11 -0500
Subject: [PATCH 083/239] ext4: fix a data race in EXT4_I(inode)->i_disksize

EXT4_I(inode)->i_disksize could be accessed concurrently as noticed by
KCSAN,

 BUG: KCSAN: data-race in ext4_write_end [ext4] / ext4_writepages [ext4]

 write to 0xffff91c6713b00f8 of 8 bytes by task 49268 on cpu 127:
  ext4_write_end+0x4e3/0x750 [ext4]
  ext4_update_i_disksize at fs/ext4/ext4.h:3032
  (inlined by) ext4_update_inode_size at fs/ext4/ext4.h:3046
  (inlined by) ext4_write_end at fs/ext4/inode.c:1287
  generic_perform_write+0x208/0x2a0
  ext4_buffered_write_iter+0x11f/0x210 [ext4]
  ext4_file_write_iter+0xce/0x9e0 [ext4]
  new_sync_write+0x29c/0x3b0
  __vfs_write+0x92/0xa0
  vfs_write+0x103/0x260
  ksys_write+0x9d/0x130
  __x64_sys_write+0x4c/0x60
  do_syscall_64+0x91/0xb47
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

 read to 0xffff91c6713b00f8 of 8 bytes by task 24872 on cpu 37:
  ext4_writepages+0x10ac/0x1d00 [ext4]
  mpage_map_and_submit_extent at fs/ext4/inode.c:2468
  (inlined by) ext4_writepages at fs/ext4/inode.c:2772
  do_writepages+0x5e/0x130
  __writeback_single_inode+0xeb/0xb20
  writeback_sb_inodes+0x429/0x900
  __writeback_inodes_wb+0xc4/0x150
  wb_writeback+0x4bd/0x870
  wb_workfn+0x6b4/0x960
  process_one_work+0x54c/0xbe0
  worker_thread+0x80/0x650
  kthread+0x1e0/0x200
  ret_from_fork+0x27/0x50

 Reported by Kernel Concurrency Sanitizer on:
 CPU: 37 PID: 24872 Comm: kworker/u261:2 Tainted: G        W  O L 5.5.0-next-20200204+ #5
 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019
 Workqueue: writeback wb_workfn (flush-7:0)

Since only the read is operating as lockless (outside of the
"i_data_sem"), load tearing could introduce a logic bug. Fix it by
adding READ_ONCE() for the read and WRITE_ONCE() for the write.

Signed-off-by: Qian Cai <cai@lca.pw>
Link: https://lore.kernel.org/r/1581085751-31793-1-git-send-email-cai@lca.pw
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/ext4.h  | 2 +-
 fs/ext4/inode.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4441331d06cc..480badcf2783 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3032,7 +3032,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 		     !inode_is_locked(inode));
 	down_write(&EXT4_I(inode)->i_data_sem);
 	if (newsize > EXT4_I(inode)->i_disksize)
-		EXT4_I(inode)->i_disksize = newsize;
+		WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
 	up_write(&EXT4_I(inode)->i_data_sem);
 }
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e60aca791d3f..6e1d81ed44ad 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2465,7 +2465,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 	 * truncate are avoided by checking i_size under i_data_sem.
 	 */
 	disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
-	if (disksize > EXT4_I(inode)->i_disksize) {
+	if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
 		int err2;
 		loff_t i_size;
 

From 9424ef56e13a1f14c57ea161eed3ecfdc7b2770e Mon Sep 17 00:00:00 2001
From: Shijie Luo <luoshijie1@huawei.com>
Date: Sat, 15 Feb 2020 03:02:06 -0500
Subject: [PATCH 084/239] ext4: add cond_resched() to __ext4_find_entry()

We tested a soft lockup problem in linux 4.19 which could also
be found in linux 5.x.

When dir inode takes up a large number of blocks, and if the
directory is growing when we are searching, it's possible the
restart branch could be called many times, and the do while loop
could hold cpu a long time.

Here is the call trace in linux 4.19.

[  473.756186] Call trace:
[  473.756196]  dump_backtrace+0x0/0x198
[  473.756199]  show_stack+0x24/0x30
[  473.756205]  dump_stack+0xa4/0xcc
[  473.756210]  watchdog_timer_fn+0x300/0x3e8
[  473.756215]  __hrtimer_run_queues+0x114/0x358
[  473.756217]  hrtimer_interrupt+0x104/0x2d8
[  473.756222]  arch_timer_handler_virt+0x38/0x58
[  473.756226]  handle_percpu_devid_irq+0x90/0x248
[  473.756231]  generic_handle_irq+0x34/0x50
[  473.756234]  __handle_domain_irq+0x68/0xc0
[  473.756236]  gic_handle_irq+0x6c/0x150
[  473.756238]  el1_irq+0xb8/0x140
[  473.756286]  ext4_es_lookup_extent+0xdc/0x258 [ext4]
[  473.756310]  ext4_map_blocks+0x64/0x5c0 [ext4]
[  473.756333]  ext4_getblk+0x6c/0x1d0 [ext4]
[  473.756356]  ext4_bread_batch+0x7c/0x1f8 [ext4]
[  473.756379]  ext4_find_entry+0x124/0x3f8 [ext4]
[  473.756402]  ext4_lookup+0x8c/0x258 [ext4]
[  473.756407]  __lookup_hash+0x8c/0xe8
[  473.756411]  filename_create+0xa0/0x170
[  473.756413]  do_mkdirat+0x6c/0x140
[  473.756415]  __arm64_sys_mkdirat+0x28/0x38
[  473.756419]  el0_svc_common+0x78/0x130
[  473.756421]  el0_svc_handler+0x38/0x78
[  473.756423]  el0_svc+0x8/0xc
[  485.755156] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [tmp:5149]

Add cond_resched() to avoid soft lockup and to provide a better
system responding.

Link: https://lore.kernel.org/r/20200215080206.13293-1-luoshijie1@huawei.com
Signed-off-by: Shijie Luo <luoshijie1@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
---
 fs/ext4/namei.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ceff4b4b1877..b05ea72f38fd 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1511,6 +1511,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
 		/*
 		 * We deal with the read-ahead logic here.
 		 */
+		cond_resched();
 		if (ra_ptr >= ra_max) {
 			/* Refill the readahead buffer */
 			ra_ptr = 0;

From ce4a64e1f656138e2a1481049ea554720f86b43a Mon Sep 17 00:00:00 2001
From: Scott Branden <scott.branden@broadcom.com>
Date: Wed, 19 Feb 2020 14:14:03 -0800
Subject: [PATCH 085/239] docs: arm64: fix trivial spelling enought to enough
 in memory.rst

Fix trivial spelling error enought to enough in memory.rst.

Cc: trivial@kernel.org
Signed-off-by: Scott Branden <scott.branden@broadcom.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/arm64/memory.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst
index 02e02175e6f5..cf03b3290800 100644
--- a/Documentation/arm64/memory.rst
+++ b/Documentation/arm64/memory.rst
@@ -129,7 +129,7 @@ this logic.
 
 As a single binary will need to support both 48-bit and 52-bit VA
 spaces, the VMEMMAP must be sized large enough for 52-bit VAs and
-also must be sized large enought to accommodate a fixed PAGE_OFFSET.
+also must be sized large enough to accommodate a fixed PAGE_OFFSET.
 
 Most code in the kernel should not need to consider the VA_BITS, for
 code that does need to know the VA size the variables are

From dcde237319e626d1ec3c9d8b7613032f0fd4663a Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Wed, 19 Feb 2020 12:31:56 +0000
Subject: [PATCH 086/239] mm: Avoid creating virtual address aliases in
 brk()/mmap()/mremap()

Currently the arm64 kernel ignores the top address byte passed to brk(),
mmap() and mremap(). When the user is not aware of the 56-bit address
limit or relies on the kernel to return an error, untagging such
pointers has the potential to create address aliases in user-space.
Passing a tagged address to munmap(), madvise() is permitted since the
tagged pointer is expected to be inside an existing mapping.

The current behaviour breaks the existing glibc malloc() implementation
which relies on brk() with an address beyond 56-bit to be rejected by
the kernel.

Remove untagging in the above functions by partially reverting commit
ce18d171cb73 ("mm: untag user pointers in mmap/munmap/mremap/brk"). In
addition, update the arm64 tagged-address-abi.rst document accordingly.

Link: https://bugzilla.redhat.com/1797052
Fixes: ce18d171cb73 ("mm: untag user pointers in mmap/munmap/mremap/brk")
Cc: <stable@vger.kernel.org> # 5.4.x-
Cc: Florian Weimer <fweimer@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Victor Stinner <vstinner@redhat.com>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/arm64/tagged-address-abi.rst | 11 +++++++++--
 mm/mmap.c                                  |  4 ----
 mm/mremap.c                                |  1 -
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst
index d4a85d535bf9..4a9d9c794ee5 100644
--- a/Documentation/arm64/tagged-address-abi.rst
+++ b/Documentation/arm64/tagged-address-abi.rst
@@ -44,8 +44,15 @@ The AArch64 Tagged Address ABI has two stages of relaxation depending
 how the user addresses are used by the kernel:
 
 1. User addresses not accessed by the kernel but used for address space
-   management (e.g. ``mmap()``, ``mprotect()``, ``madvise()``). The use
-   of valid tagged pointers in this context is always allowed.
+   management (e.g. ``mprotect()``, ``madvise()``). The use of valid
+   tagged pointers in this context is allowed with the exception of
+   ``brk()``, ``mmap()`` and the ``new_address`` argument to
+   ``mremap()`` as these have the potential to alias with existing
+   user addresses.
+
+   NOTE: This behaviour changed in v5.6 and so some earlier kernels may
+   incorrectly accept valid tagged pointers for the ``brk()``,
+   ``mmap()`` and ``mremap()`` system calls.
 
 2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
    relaxation is disabled by default and the application thread needs to
diff --git a/mm/mmap.c b/mm/mmap.c
index 6756b8bb0033..d681a20eb4ea 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -195,8 +195,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	bool downgraded = false;
 	LIST_HEAD(uf);
 
-	brk = untagged_addr(brk);
-
 	if (down_write_killable(&mm->mmap_sem))
 		return -EINTR;
 
@@ -1557,8 +1555,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
 	struct file *file = NULL;
 	unsigned long retval;
 
-	addr = untagged_addr(addr);
-
 	if (!(flags & MAP_ANONYMOUS)) {
 		audit_mmap_fd(fd, flags);
 		file = fget(fd);
diff --git a/mm/mremap.c b/mm/mremap.c
index 122938dcec15..af363063ea23 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -607,7 +607,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	LIST_HEAD(uf_unmap);
 
 	addr = untagged_addr(addr);
-	new_addr = untagged_addr(new_addr);
 
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
 		return ret;

From 68ca0fd272dac9a9f78fbf14b5e65de34f12c1b4 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Thu, 6 Feb 2020 08:11:39 +0000
Subject: [PATCH 087/239] selftest/lkdtm: Don't pollute 'git status'

Commit 46d1a0f03d66 ("selftests/lkdtm: Add tests for LKDTM targets")
added generation of lkdtm test scripts.

Ignore those generated scripts when performing 'git status'

Fixes: 46d1a0f03d66 ("selftests/lkdtm: Add tests for LKDTM targets")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 72ef86a5570d..2763fce8766c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -100,6 +100,10 @@ modules.order
 /include/ksym/
 /arch/*/include/generated/
 
+# Generated lkdtm tests
+/tools/testing/selftests/lkdtm/*.sh
+!/tools/testing/selftests/lkdtm/run.sh
+
 # stgit generated dirs
 patches-*
 

From b9167c8078c3527de6da241c8a1a75a9224ed90a Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 20 Feb 2020 15:42:41 +1100
Subject: [PATCH 088/239] selftests: Install settings files to fix TIMEOUT
 failures

Commit 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second
timeout per test") added a 45 second timeout for tests, and also added
a way for tests to customise the timeout via a settings file.

For example the ftrace tests take multiple minutes to run, so they
were given longer in commit b43e78f65b1d ("tracing/selftests: Turn off
timeout setting").

This works when the tests are run from the source tree. However if the
tests are installed with "make -C tools/testing/selftests install",
the settings files are not copied into the install directory. When the
tests are then run from the install directory the longer timeouts are
not applied and the tests timeout incorrectly.

So add the settings files to TEST_FILES of the appropriate Makefiles
to cause the settings files to be installed using the existing install
logic.

Fixes: 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second timeout per test")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/ftrace/Makefile    | 2 +-
 tools/testing/selftests/livepatch/Makefile | 2 ++
 tools/testing/selftests/net/mptcp/Makefile | 2 ++
 tools/testing/selftests/rseq/Makefile      | 2 ++
 tools/testing/selftests/rtc/Makefile       | 2 ++
 5 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/ftrace/Makefile b/tools/testing/selftests/ftrace/Makefile
index cd1f5b3a7774..d6e106fbce11 100644
--- a/tools/testing/selftests/ftrace/Makefile
+++ b/tools/testing/selftests/ftrace/Makefile
@@ -2,7 +2,7 @@
 all:
 
 TEST_PROGS := ftracetest
-TEST_FILES := test.d
+TEST_FILES := test.d settings
 EXTRA_CLEAN := $(OUTPUT)/logs/*
 
 include ../lib.mk
diff --git a/tools/testing/selftests/livepatch/Makefile b/tools/testing/selftests/livepatch/Makefile
index 3876d8d62494..1acc9e1fa3fb 100644
--- a/tools/testing/selftests/livepatch/Makefile
+++ b/tools/testing/selftests/livepatch/Makefile
@@ -8,4 +8,6 @@ TEST_PROGS := \
 	test-state.sh \
 	test-ftrace.sh
 
+TEST_FILES := settings
+
 include ../lib.mk
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
index 93de52016dde..ba450e62dc5b 100644
--- a/tools/testing/selftests/net/mptcp/Makefile
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -8,6 +8,8 @@ TEST_PROGS := mptcp_connect.sh
 
 TEST_GEN_FILES = mptcp_connect
 
+TEST_FILES := settings
+
 EXTRA_CLEAN := *.pcap
 
 include ../../lib.mk
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index d6469535630a..f1053630bb6f 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -19,6 +19,8 @@ TEST_GEN_PROGS_EXTENDED = librseq.so
 
 TEST_PROGS = run_param_test.sh
 
+TEST_FILES := settings
+
 include ../lib.mk
 
 $(OUTPUT)/librseq.so: rseq.c rseq.h rseq-*.h
diff --git a/tools/testing/selftests/rtc/Makefile b/tools/testing/selftests/rtc/Makefile
index 2d93d65723c9..55198ecc04db 100644
--- a/tools/testing/selftests/rtc/Makefile
+++ b/tools/testing/selftests/rtc/Makefile
@@ -6,4 +6,6 @@ TEST_GEN_PROGS = rtctest
 
 TEST_GEN_PROGS_EXTENDED = setdate
 
+TEST_FILES := settings
+
 include ../lib.mk

From ef89d0545132d685f73da6f58b7e7fe002536f91 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 20 Feb 2020 22:37:48 +1100
Subject: [PATCH 089/239] selftests/rseq: Fix out-of-tree compilation

Currently if you build with O=... the rseq tests don't build:

  $ make O=$PWD/output -C tools/testing/selftests/ TARGETS=rseq
  make: Entering directory '/linux/tools/testing/selftests'
  ...
  make[1]: Entering directory '/linux/tools/testing/selftests/rseq'
  gcc -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./  -shared -fPIC rseq.c -lpthread -o /linux/output/rseq/librseq.so
  gcc -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./  basic_test.c -lpthread -lrseq -o /linux/output/rseq/basic_test
  /usr/bin/ld: cannot find -lrseq
  collect2: error: ld returned 1 exit status

This is because the library search path points to the source
directory, not the output.

We can fix it by changing the library search path to $(OUTPUT).

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 tools/testing/selftests/rseq/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
index f1053630bb6f..2af9d39a9716 100644
--- a/tools/testing/selftests/rseq/Makefile
+++ b/tools/testing/selftests/rseq/Makefile
@@ -4,7 +4,7 @@ ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
 CLANG_FLAGS += -no-integrated-as
 endif
 
-CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L./ -Wl,-rpath=./ \
+CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L$(OUTPUT) -Wl,-rpath=./ \
 	  $(CLANG_FLAGS)
 LDLIBS += -lpthread
 

From 9038ec99ceb94fb8d93ade5e236b2928f0792c7c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 19 Feb 2020 22:23:18 -0800
Subject: [PATCH 090/239] x86/xen: Distribute switch variables for
 initialization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Variables declared in a switch statement before any case statements
cannot be automatically initialized with compiler instrumentation (as
they are not part of any execution flow). With GCC's proposed automatic
stack variable initialization feature, this triggers a warning (and they
don't get initialized). Clang's automatic stack variable initialization
(via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also
doesn't initialize such variables[1]. Note that these warnings (or silent
skipping) happen before the dead-store elimination optimization phase,
so even when the automatic initializations are later elided in favor of
direct initializations, the warnings remain.

To avoid these problems, move such variables into the "case" where
they're used or lift them up into the main function body.

arch/x86/xen/enlighten_pv.c: In function ‘xen_write_msr_safe’:
arch/x86/xen/enlighten_pv.c:904:12: warning: statement will never be executed [-Wswitch-unreachable]
  904 |   unsigned which;
      |            ^~~~~

[1] https://bugs.llvm.org/show_bug.cgi?id=44916

Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20200220062318.69299-1-keescook@chromium.org
Reviewed-by: Juergen Gross <jgross@suse.com>
[boris: made @which an 'unsigned int']
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/enlighten_pv.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ae4a41ca19f6..3fbbc50bb032 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -896,14 +896,15 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
 	int ret;
+#ifdef CONFIG_X86_64
+	unsigned int which;
+	u64 base;
+#endif
 
 	ret = 0;
 
 	switch (msr) {
 #ifdef CONFIG_X86_64
-		unsigned which;
-		u64 base;
-
 	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
 	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
 	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;

From 147f1a1fe5d7e6b01b8df4d0cbd6f9eaf6b6c73b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 13 Feb 2020 18:24:48 +0100
Subject: [PATCH 091/239] KVM: x86: fix incorrect comparison in trace event

The "u" field in the event has three states, -1/0/1.  Using u8 however means that
comparison with -1 will always fail, so change to signed char.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmutrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3c6522b84ff1..ffcd96fc02d0 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -339,7 +339,7 @@ TRACE_EVENT(
 		/* These depend on page entry type, so compute them now.  */
 		__field(bool, r)
 		__field(bool, x)
-		__field(u8, u)
+		__field(signed char, u)
 	),
 
 	TP_fast_assign(

From b78a8552d77f8efb7c4fbd92a91b890c32b89528 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Mon, 17 Feb 2020 11:48:26 -0500
Subject: [PATCH 092/239] kvm/emulate: fix a -Werror=cast-function-type

arch/x86/kvm/emulate.c: In function 'x86_emulate_insn':
arch/x86/kvm/emulate.c:5686:22: error: cast between incompatible
function types from 'int (*)(struct x86_emulate_ctxt *)' to 'void
(*)(struct fastop *)' [-Werror=cast-function-type]
    rc = fastop(ctxt, (fastop_t)ctxt->execute);

Fix it by using an unnamed union of a (*execute) function pointer and a
(*fastop) function pointer.

Fixes: 3009afc6e39e ("KVM: x86: Use a typedef for fastop functions")
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h | 13 ++++++++++-
 arch/x86/kvm/emulate.c             | 36 ++++++++++++------------------
 2 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 03946eb3e2b9..2a8f2bd2e5cf 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -292,6 +292,14 @@ enum x86emul_mode {
 #define X86EMUL_SMM_MASK             (1 << 6)
 #define X86EMUL_SMM_INSIDE_NMI_MASK  (1 << 7)
 
+/*
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+struct fastop;
+
+typedef void (*fastop_t)(struct fastop *);
+
 struct x86_emulate_ctxt {
 	const struct x86_emulate_ops *ops;
 
@@ -324,7 +332,10 @@ struct x86_emulate_ctxt {
 	struct operand src;
 	struct operand src2;
 	struct operand dst;
-	int (*execute)(struct x86_emulate_ctxt *ctxt);
+	union {
+		int (*execute)(struct x86_emulate_ctxt *ctxt);
+		fastop_t fop;
+	};
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
 	/*
 	 * The following six fields are cleared together,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ddbc61984227..dd19fb3539e0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -191,25 +191,6 @@
 #define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
 #define FASTOP_SIZE 8
 
-/*
- * fastop functions have a special calling convention:
- *
- * dst:    rax        (in/out)
- * src:    rdx        (in/out)
- * src2:   rcx        (in)
- * flags:  rflags     (in/out)
- * ex:     rsi        (in:fastop pointer, out:zero if exception)
- *
- * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
- * different operand sizes can be reached by calculation, rather than a jump
- * table (which would be bigger than the code).
- *
- * fastop functions are declared as taking a never-defined fastop parameter,
- * so they can't be called from C directly.
- */
-
-struct fastop;
-
 struct opcode {
 	u64 flags : 56;
 	u64 intercept : 8;
@@ -311,8 +292,19 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
 #define ON64(x)
 #endif
 
-typedef void (*fastop_t)(struct fastop *);
-
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst:    rax        (in/out)
+ * src:    rdx        (in/out)
+ * src2:   rcx        (in)
+ * flags:  rflags     (in/out)
+ * ex:     rsi        (in:fastop pointer, out:zero if exception)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ */
 static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
 
 #define __FOP_FUNC(name) \
@@ -5683,7 +5675,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 	if (ctxt->execute) {
 		if (ctxt->d & Fastop)
-			rc = fastop(ctxt, (fastop_t)ctxt->execute);
+			rc = fastop(ctxt, ctxt->fop);
 		else
 			rc = ctxt->execute(ctxt);
 		if (rc != X86EMUL_CONTINUE)

From 279eef0531928a6669230879a6eed081513ad5a3 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 14 Feb 2020 16:56:38 -0600
Subject: [PATCH 093/239] tracing: Make sure synth_event_trace() example always
 uses u64

synth_event_trace() is the varargs version of synth_event_trace_array(),
which takes an array of u64, as do synth_event_add_val() et al.

To not only be consistent with those, but also to address the fact
that synth_event_trace() expects every arg to be of the same type
since it doesn't also pass in e.g. a format string, the caller needs
to make sure all args are of the same type, u64.  u64 is used because
it needs to accomodate the largest type available in synthetic events,
which is u64.

This fixes the bug reported by the kernel test robot/Rong Chen.

Link: https://lore.kernel.org/lkml/20200212113444.GS12867@shao2-debian/
Link: http://lkml.kernel.org/r/894c4e955558b521210ee0642ba194a9e603354c.1581720155.git.zanussi@kernel.org

Fixes: 9fe41efaca084 ("tracing: Add synth event generation test module")
Reported-by: kernel test robot <rong.a.chen@intel.com>
Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/synth_event_gen_test.c | 34 ++++++++++++++---------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 4aefe003cb7c..6866280a9b10 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -111,11 +111,11 @@ static int __init test_gen_synth_cmd(void)
 	/* Create some bogus values just for testing */
 
 	vals[0] = 777;			/* next_pid_field */
-	vals[1] = (u64)"hula hoops";	/* next_comm_field */
+	vals[1] = (u64)(long)"hula hoops";	/* next_comm_field */
 	vals[2] = 1000000;		/* ts_ns */
 	vals[3] = 1000;			/* ts_ms */
 	vals[4] = smp_processor_id();	/* cpu */
-	vals[5] = (u64)"thneed";	/* my_string_field */
+	vals[5] = (u64)(long)"thneed";	/* my_string_field */
 	vals[6] = 598;			/* my_int_field */
 
 	/* Now generate a gen_synth_test event */
@@ -218,11 +218,11 @@ static int __init test_empty_synth_event(void)
 	/* Create some bogus values just for testing */
 
 	vals[0] = 777;			/* next_pid_field */
-	vals[1] = (u64)"tiddlywinks";	/* next_comm_field */
+	vals[1] = (u64)(long)"tiddlywinks";	/* next_comm_field */
 	vals[2] = 1000000;		/* ts_ns */
 	vals[3] = 1000;			/* ts_ms */
 	vals[4] = smp_processor_id();	/* cpu */
-	vals[5] = (u64)"thneed_2.0";	/* my_string_field */
+	vals[5] = (u64)(long)"thneed_2.0";	/* my_string_field */
 	vals[6] = 399;			/* my_int_field */
 
 	/* Now trace an empty_synth_test event */
@@ -290,11 +290,11 @@ static int __init test_create_synth_event(void)
 	/* Create some bogus values just for testing */
 
 	vals[0] = 777;			/* next_pid_field */
-	vals[1] = (u64)"tiddlywinks";	/* next_comm_field */
+	vals[1] = (u64)(long)"tiddlywinks";	/* next_comm_field */
 	vals[2] = 1000000;		/* ts_ns */
 	vals[3] = 1000;			/* ts_ms */
 	vals[4] = smp_processor_id();	/* cpu */
-	vals[5] = (u64)"thneed";	/* my_string_field */
+	vals[5] = (u64)(long)"thneed";	/* my_string_field */
 	vals[6] = 398;			/* my_int_field */
 
 	/* Now generate a create_synth_test event */
@@ -330,7 +330,7 @@ static int __init test_add_next_synth_val(void)
 		goto out;
 
 	/* next_comm_field */
-	ret = synth_event_add_next_val((u64)"slinky", &trace_state);
+	ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state);
 	if (ret)
 		goto out;
 
@@ -350,7 +350,7 @@ static int __init test_add_next_synth_val(void)
 		goto out;
 
 	/* my_string_field */
-	ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state);
+	ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state);
 	if (ret)
 		goto out;
 
@@ -396,12 +396,12 @@ static int __init test_add_synth_val(void)
 	if (ret)
 		goto out;
 
-	ret = synth_event_add_val("next_comm_field", (u64)"silly putty",
+	ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty",
 				  &trace_state);
 	if (ret)
 		goto out;
 
-	ret = synth_event_add_val("my_string_field", (u64)"thneed_9",
+	ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9",
 				  &trace_state);
 	if (ret)
 		goto out;
@@ -423,13 +423,13 @@ static int __init test_trace_synth_event(void)
 
 	/* Trace some bogus values just for testing */
 	ret = synth_event_trace(create_synth_test, 7,	/* number of values */
-				444,			/* next_pid_field */
-				(u64)"clackers",	/* next_comm_field */
-				1000000,		/* ts_ns */
-				1000,			/* ts_ms */
-				smp_processor_id(),	/* cpu */
-				(u64)"Thneed",		/* my_string_field */
-				999);			/* my_int_field */
+				(u64)444,		/* next_pid_field */
+				(u64)(long)"clackers",	/* next_comm_field */
+				(u64)1000000,		/* ts_ns */
+				(u64)1000,		/* ts_ms */
+				(u64)smp_processor_id(),/* cpu */
+				(u64)(long)"Thneed",	/* my_string_field */
+				(u64)999);		/* my_int_field */
 	return ret;
 }
 

From 1d9d4c90194a8c3b2f7da9f4bf3f8ba2ed810656 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 14 Feb 2020 16:56:39 -0600
Subject: [PATCH 094/239] tracing: Make synth_event trace functions
 endian-correct

synth_event_trace(), synth_event_trace_array() and
__synth_event_add_val() write directly into the trace buffer and need
to take endianness into account, like trace_event_raw_event_synth()
does.

Link: http://lkml.kernel.org/r/2011354355e405af9c9d28abba430d1f5ff7771a.1581720155.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 62 +++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 65b54d6a1422..6a380fb83864 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1891,7 +1891,25 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
 			strscpy(str_field, str_val, STR_VAR_LEN_MAX);
 			n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
 		} else {
-			state.entry->fields[n_u64] = val;
+			struct synth_field *field = state.event->fields[i];
+
+			switch (field->size) {
+			case 1:
+				*(u8 *)&state.entry->fields[n_u64] = (u8)val;
+				break;
+
+			case 2:
+				*(u16 *)&state.entry->fields[n_u64] = (u16)val;
+				break;
+
+			case 4:
+				*(u32 *)&state.entry->fields[n_u64] = (u32)val;
+				break;
+
+			default:
+				state.entry->fields[n_u64] = val;
+				break;
+			}
 			n_u64++;
 		}
 	}
@@ -1943,7 +1961,26 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
 			strscpy(str_field, str_val, STR_VAR_LEN_MAX);
 			n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
 		} else {
-			state.entry->fields[n_u64] = vals[i];
+			struct synth_field *field = state.event->fields[i];
+			u64 val = vals[i];
+
+			switch (field->size) {
+			case 1:
+				*(u8 *)&state.entry->fields[n_u64] = (u8)val;
+				break;
+
+			case 2:
+				*(u16 *)&state.entry->fields[n_u64] = (u16)val;
+				break;
+
+			case 4:
+				*(u32 *)&state.entry->fields[n_u64] = (u32)val;
+				break;
+
+			default:
+				state.entry->fields[n_u64] = val;
+				break;
+			}
 			n_u64++;
 		}
 	}
@@ -2062,8 +2099,25 @@ static int __synth_event_add_val(const char *field_name, u64 val,
 
 		str_field = (char *)&entry->fields[field->offset];
 		strscpy(str_field, str_val, STR_VAR_LEN_MAX);
-	} else
-		entry->fields[field->offset] = val;
+	} else {
+		switch (field->size) {
+		case 1:
+			*(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
+			break;
+
+		case 2:
+			*(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
+			break;
+
+		case 4:
+			*(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
+			break;
+
+		default:
+			trace_state->entry->fields[field->offset] = val;
+			break;
+		}
+	}
  out:
 	return ret;
 }

From 3843083772dc2afde790a6d7160658b00a808da1 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 14 Feb 2020 16:56:40 -0600
Subject: [PATCH 095/239] tracing: Check that number of vals matches number of
 synth event fields

Commit 7276531d4036('tracing: Consolidate trace() functions')
inadvertently dropped the synth_event_trace() and
synth_event_trace_array() checks that verify the number of values
passed in matches the number of fields in the synthetic event being
traced, so add them back.

Link: http://lkml.kernel.org/r/32819cac708714693669e0dfe10fe9d935e94a16.1581720155.git.zanussi@kernel.org

Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6a380fb83864..45622194a34d 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1878,6 +1878,11 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
 		return ret;
 	}
 
+	if (n_vals != state.event->n_fields) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	va_start(args, n_vals);
 	for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
 		u64 val;
@@ -1914,7 +1919,7 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
 		}
 	}
 	va_end(args);
-
+out:
 	__synth_event_trace_end(&state);
 
 	return ret;
@@ -1953,6 +1958,11 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
 		return ret;
 	}
 
+	if (n_vals != state.event->n_fields) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
 		if (state.event->fields[i]->is_string) {
 			char *str_val = (char *)(long)vals[i];
@@ -1984,7 +1994,7 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
 			n_u64++;
 		}
 	}
-
+out:
 	__synth_event_trace_end(&state);
 
 	return ret;

From 784bd0847eda032ed2f3522f87250655a18c0190 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <zanussi@kernel.org>
Date: Fri, 14 Feb 2020 16:56:41 -0600
Subject: [PATCH 096/239] tracing: Fix number printing bug in
 print_synth_event()

Fix a varargs-related bug in print_synth_event() which resulted in
strange output and oopses on 32-bit x86 systems. The problem is that
trace_seq_printf() expects the varargs to match the format string, but
print_synth_event() was always passing u64 values regardless.  This
results in unspecified behavior when unpacking with va_arg() in
trace_seq_printf().

Add a function that takes the size into account when calling
trace_seq_printf().

Before:

  modprobe-1731  [003] ....   919.039758: gen_synth_test: next_pid_field=777(null)next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=3(null)my_string_field=thneed my_int_field=598(null)

After:

 insmod-1136  [001] ....    36.634590: gen_synth_test: next_pid_field=777 next_comm_field=hula hoops ts_ns=1000000 ts_ms=1000 cpu=1 my_string_field=thneed my_int_field=598

Link: http://lkml.kernel.org/r/a9b59eb515dbbd7d4abe53b347dccf7a8e285657.1581720155.git.zanussi@kernel.org

Reported-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 45622194a34d..f068d55bd37f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -820,6 +820,29 @@ static const char *synth_field_fmt(char *type)
 	return fmt;
 }
 
+static void print_synth_event_num_val(struct trace_seq *s,
+				      char *print_fmt, char *name,
+				      int size, u64 val, char *space)
+{
+	switch (size) {
+	case 1:
+		trace_seq_printf(s, print_fmt, name, (u8)val, space);
+		break;
+
+	case 2:
+		trace_seq_printf(s, print_fmt, name, (u16)val, space);
+		break;
+
+	case 4:
+		trace_seq_printf(s, print_fmt, name, (u32)val, space);
+		break;
+
+	default:
+		trace_seq_printf(s, print_fmt, name, val, space);
+		break;
+	}
+}
+
 static enum print_line_t print_synth_event(struct trace_iterator *iter,
 					   int flags,
 					   struct trace_event *event)
@@ -858,10 +881,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
 		} else {
 			struct trace_print_flags __flags[] = {
 			    __def_gfpflag_names, {-1, NULL} };
+			char *space = (i == se->n_fields - 1 ? "" : " ");
 
-			trace_seq_printf(s, print_fmt, se->fields[i]->name,
-					 entry->fields[n_u64],
-					 i == se->n_fields - 1 ? "" : " ");
+			print_synth_event_num_val(s, print_fmt,
+						  se->fields[i]->name,
+						  se->fields[i]->size,
+						  entry->fields[n_u64],
+						  space);
 
 			if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
 				trace_seq_puts(s, " (");

From 8645e56a4ad6dcbf504872db7f14a2f67db88ef2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Feb 2020 18:30:26 +0100
Subject: [PATCH 097/239] xen: Enable interrupts when calling _cond_resched()

xen_maybe_preempt_hcall() is called from the exception entry point
xen_do_hypervisor_callback with interrupts disabled.

_cond_resched() evades the might_sleep() check in cond_resched() which
would have caught that and schedule_debug() unfortunately lacks a check
for irqs_disabled().

Enable interrupts around the call and use cond_resched() to catch future
issues.

Fixes: fdfd811ddde3 ("x86/xen: allow privcmd hypercalls to be preempted")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/878skypjrh.fsf@nanos.tec.linutronix.de
Reviewed-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/preempt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
index 8b9919c26095..456a164364a2 100644
--- a/drivers/xen/preempt.c
+++ b/drivers/xen/preempt.c
@@ -33,7 +33,9 @@ asmlinkage __visible void xen_maybe_preempt_hcall(void)
 		 * cpu.
 		 */
 		__this_cpu_write(xen_in_preemptible_hcall, false);
-		_cond_resched();
+		local_irq_enable();
+		cond_resched();
+		local_irq_disable();
 		__this_cpu_write(xen_in_preemptible_hcall, true);
 	}
 }

From 3c18a9be7c9d4f53239795282c5d927f73f534b3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 20 Feb 2020 16:29:50 -0500
Subject: [PATCH 098/239] tracing: Have synthetic event test use
 raw_smp_processor_id()

The test code that tests synthetic event creation pushes in as one of its
test fields the current CPU using "smp_processor_id()". As this is just
something to see if the value is correctly passed in, and the actual CPU
used does not matter, use raw_smp_processor_id(), otherwise with debug
preemption enabled, a warning happens as the smp_processor_id() is called
without preemption enabled.

Link: http://lkml.kernel.org/r/20200220162950.35162579@gandalf.local.home

Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/synth_event_gen_test.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 6866280a9b10..7d56d621ffea 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -114,7 +114,7 @@ static int __init test_gen_synth_cmd(void)
 	vals[1] = (u64)(long)"hula hoops";	/* next_comm_field */
 	vals[2] = 1000000;		/* ts_ns */
 	vals[3] = 1000;			/* ts_ms */
-	vals[4] = smp_processor_id();	/* cpu */
+	vals[4] = raw_smp_processor_id(); /* cpu */
 	vals[5] = (u64)(long)"thneed";	/* my_string_field */
 	vals[6] = 598;			/* my_int_field */
 
@@ -221,7 +221,7 @@ static int __init test_empty_synth_event(void)
 	vals[1] = (u64)(long)"tiddlywinks";	/* next_comm_field */
 	vals[2] = 1000000;		/* ts_ns */
 	vals[3] = 1000;			/* ts_ms */
-	vals[4] = smp_processor_id();	/* cpu */
+	vals[4] = raw_smp_processor_id(); /* cpu */
 	vals[5] = (u64)(long)"thneed_2.0";	/* my_string_field */
 	vals[6] = 399;			/* my_int_field */
 
@@ -293,7 +293,7 @@ static int __init test_create_synth_event(void)
 	vals[1] = (u64)(long)"tiddlywinks";	/* next_comm_field */
 	vals[2] = 1000000;		/* ts_ns */
 	vals[3] = 1000;			/* ts_ms */
-	vals[4] = smp_processor_id();	/* cpu */
+	vals[4] = raw_smp_processor_id(); /* cpu */
 	vals[5] = (u64)(long)"thneed";	/* my_string_field */
 	vals[6] = 398;			/* my_int_field */
 
@@ -345,7 +345,7 @@ static int __init test_add_next_synth_val(void)
 		goto out;
 
 	/* cpu */
-	ret = synth_event_add_next_val(smp_processor_id(), &trace_state);
+	ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state);
 	if (ret)
 		goto out;
 
@@ -388,7 +388,7 @@ static int __init test_add_synth_val(void)
 	if (ret)
 		goto out;
 
-	ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state);
+	ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state);
 	if (ret)
 		goto out;
 
@@ -427,7 +427,7 @@ static int __init test_trace_synth_event(void)
 				(u64)(long)"clackers",	/* next_comm_field */
 				(u64)1000000,		/* ts_ns */
 				(u64)1000,		/* ts_ms */
-				(u64)smp_processor_id(),/* cpu */
+				(u64)raw_smp_processor_id(), /* cpu */
 				(u64)(long)"Thneed",	/* my_string_field */
 				(u64)999);		/* my_int_field */
 	return ret;

From 78041c0c9e935d9ce4086feeff6c569ed88ddfd4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 20 Feb 2020 15:38:01 -0500
Subject: [PATCH 099/239] tracing: Disable trace_printk() on post poned tests

The tracing seftests checks various aspects of the tracing infrastructure,
and one is filtering. If trace_printk() is active during a self test, it can
cause the filtering to fail, which will disable that part of the trace.

To keep the selftests from failing because of trace_printk() calls,
trace_printk() checks the variable tracing_selftest_running, and if set, it
does not write to the tracing buffer.

As some tracers were registered earlier in boot, the selftest they triggered
would fail because not all the infrastructure was set up for the full
selftest. Thus, some of the tests were post poned to when their
infrastructure was ready (namely file system code). The postpone code did
not set the tracing_seftest_running variable, and could fail if a
trace_printk() was added and executed during their run.

Cc: stable@vger.kernel.org
Fixes: 9afecfbb95198 ("tracing: Postpone tracer start-up tests till the system is more robust")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 183b031a3828..a89c562ffb8f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1837,6 +1837,7 @@ static __init int init_trace_selftests(void)
 
 	pr_info("Running postponed tracer tests:\n");
 
+	tracing_selftest_running = true;
 	list_for_each_entry_safe(p, n, &postponed_selftests, list) {
 		/* This loop can take minutes when sanitizers are enabled, so
 		 * lets make sure we allow RCU processing.
@@ -1859,6 +1860,7 @@ static __init int init_trace_selftests(void)
 		list_del(&p->list);
 		kfree(p);
 	}
+	tracing_selftest_running = false;
 
  out:
 	mutex_unlock(&trace_types_lock);

From 08d9e686426f7557d3f1cda219ff907397c89d53 Mon Sep 17 00:00:00 2001
From: Qiujun Huang <hqjagain@gmail.com>
Date: Sun, 16 Feb 2020 19:28:31 +0800
Subject: [PATCH 100/239] bootconfig: Mark boot_config_checksum() static

In fact, this function is only used in this file, so mark it with 'static'.

Link: http://lkml.kernel.org/r/1581852511-14163-1-git-send-email-hqjagain@gmail.com

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Qiujun Huang <hqjagain@gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 init/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/main.c b/init/main.c
index 59248717c925..48c87f47a444 100644
--- a/init/main.c
+++ b/init/main.c
@@ -335,7 +335,7 @@ static char * __init xbc_make_cmdline(const char *key)
 	return new_cmdline;
 }
 
-u32 boot_config_checksum(unsigned char *p, u32 size)
+static u32 boot_config_checksum(unsigned char *p, u32 size)
 {
 	u32 ret = 0;
 

From 7ab215f22d04067094de8c81c20ba4c565ff8dd4 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Mon, 17 Feb 2020 18:52:39 +0900
Subject: [PATCH 101/239] tracing: Clear trace_state when starting trace

Clear trace_state data structure when starting trace
in __synth_event_trace_start() internal function.

Currently trace_state is initialized only in the
synth_event_trace_start() API, but the trace_state
in synth_event_trace() and synth_event_trace_array()
are on the stack without initialization.
This means those APIs will see wrong parameters and
wil skip closing process in __synth_event_trace_end()
because trace_state->disabled may be !0.

Link: http://lkml.kernel.org/r/158193315899.8868.1781259176894639952.stgit@devnote2

Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f068d55bd37f..9d87aa1f0b79 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1824,6 +1824,8 @@ __synth_event_trace_start(struct trace_event_file *file,
 	int entry_size, fields_size = 0;
 	int ret = 0;
 
+	memset(trace_state, '\0', sizeof(*trace_state));
+
 	/*
 	 * Normal event tracing doesn't get called at all unless the
 	 * ENABLED bit is set (which attaches the probe thus allowing
@@ -2063,8 +2065,6 @@ int synth_event_trace_start(struct trace_event_file *file,
 	if (!trace_state)
 		return -EINVAL;
 
-	memset(trace_state, '\0', sizeof(*trace_state));
-
 	ret = __synth_event_trace_start(file, trace_state);
 	if (ret == -ENOENT)
 		ret = 0; /* just disabled, not really an error */

From d8a953ddde5ec30a36810d0a892c3949b50849e9 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 20 Feb 2020 21:18:33 +0900
Subject: [PATCH 102/239] bootconfig: Set CONFIG_BOOT_CONFIG=n by default

Set CONFIG_BOOT_CONFIG=n by default. This also warns
user if CONFIG_BOOT_CONFIG=n but "bootconfig" is given
in the kernel command line.

Link: http://lkml.kernel.org/r/158220111291.26565.9036889083940367969.stgit@devnote2

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 init/Kconfig         | 1 -
 init/main.c          | 8 ++++++++
 kernel/trace/Kconfig | 3 ++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 4a672c6629d0..f586878410d2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1218,7 +1218,6 @@ endif
 config BOOT_CONFIG
 	bool "Boot config support"
 	depends on BLK_DEV_INITRD
-	default y
 	help
 	  Extra boot config allows system admin to pass a config file as
 	  complemental extension of kernel cmdline when booting.
diff --git a/init/main.c b/init/main.c
index 48c87f47a444..d96cc5f65022 100644
--- a/init/main.c
+++ b/init/main.c
@@ -418,6 +418,14 @@ static void __init setup_boot_config(const char *cmdline)
 }
 #else
 #define setup_boot_config(cmdline)	do { } while (0)
+
+static int __init warn_bootconfig(char *str)
+{
+	pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOTCONFIG is not set.\n");
+	return 0;
+}
+early_param("bootconfig", warn_bootconfig);
+
 #endif
 
 /* Change NUL term back to "=", to make "param" the whole string. */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 91e885194dbc..795c3e02d3f1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -143,7 +143,8 @@ if FTRACE
 
 config BOOTTIME_TRACING
 	bool "Boot-time Tracing support"
-	depends on BOOT_CONFIG && TRACING
+	depends on TRACING
+	select BOOT_CONFIG
 	default y
 	help
 	  Enable developer to setup ftrace subsystem via supplemental

From 85c46b78da58398be1c5166f55063c0512decd39 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 20 Feb 2020 21:18:42 +0900
Subject: [PATCH 103/239] bootconfig: Add bootconfig magic word for indicating
 bootconfig explicitly

Add bootconfig magic word to the end of bootconfig on initrd
image for indicating explicitly the bootconfig is there.
Also tools/bootconfig treats wrong size or wrong checksum or
parse error as an error, because if there is a bootconfig magic
word, there must be a bootconfig.

The bootconfig magic word is "#BOOTCONFIG\n", 12 bytes word.
Thus the block image of the initrd file with bootconfig is
as follows.

[Initrd][bootconfig][size][csum][#BOOTCONFIG\n]

Link: http://lkml.kernel.org/r/158220112263.26565.3944814205960612841.stgit@devnote2

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/admin-guide/bootconfig.rst | 10 ++++--
 include/linux/bootconfig.h               |  3 ++
 init/Kconfig                             |  2 +-
 init/main.c                              |  6 +++-
 tools/bootconfig/main.c                  | 43 ++++++++++++++++++------
 tools/bootconfig/test-bootconfig.sh      |  2 +-
 6 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index b342a6796392..5e7609936507 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -102,9 +102,13 @@ Boot Kernel With a Boot Config
 ==============================
 
 Since the boot configuration file is loaded with initrd, it will be added
-to the end of the initrd (initramfs) image file. The Linux kernel decodes
-the last part of the initrd image in memory to get the boot configuration
-data.
+to the end of the initrd (initramfs) image file with size, checksum and
+12-byte magic word as below.
+
+[initrd][bootconfig][size(u32)][checksum(u32)][#BOOTCONFIG\n]
+
+The Linux kernel decodes the last part of the initrd image in memory to
+get the boot configuration data.
 Because of this "piggyback" method, there is no need to change or
 update the boot loader and the kernel image itself.
 
diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index 7e18c939663e..d11e183fcb54 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -10,6 +10,9 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+#define BOOTCONFIG_MAGIC	"#BOOTCONFIG\n"
+#define BOOTCONFIG_MAGIC_LEN	12
+
 /* XBC tree node */
 struct xbc_node {
 	u16 next;
diff --git a/init/Kconfig b/init/Kconfig
index f586878410d2..a84e7aa89a29 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1222,7 +1222,7 @@ config BOOT_CONFIG
 	  Extra boot config allows system admin to pass a config file as
 	  complemental extension of kernel cmdline when booting.
 	  The boot config file must be attached at the end of initramfs
-	  with checksum and size.
+	  with checksum, size and magic word.
 	  See <file:Documentation/admin-guide/bootconfig.rst> for details.
 
 	  If unsure, say Y.
diff --git a/init/main.c b/init/main.c
index d96cc5f65022..2fe8dec93e68 100644
--- a/init/main.c
+++ b/init/main.c
@@ -374,7 +374,11 @@ static void __init setup_boot_config(const char *cmdline)
 	if (!initrd_end)
 		goto not_found;
 
-	hdr = (u32 *)(initrd_end - 8);
+	data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN;
+	if (memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN))
+		goto not_found;
+
+	hdr = (u32 *)(data - 8);
 	size = hdr[0];
 	csum = hdr[1];
 
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index e18eeb070562..742271f019a9 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -131,15 +131,26 @@ int load_xbc_from_initrd(int fd, char **buf)
 	struct stat stat;
 	int ret;
 	u32 size = 0, csum = 0, rcsum;
+	char magic[BOOTCONFIG_MAGIC_LEN];
 
 	ret = fstat(fd, &stat);
 	if (ret < 0)
 		return -errno;
 
-	if (stat.st_size < 8)
+	if (stat.st_size < 8 + BOOTCONFIG_MAGIC_LEN)
 		return 0;
 
-	if (lseek(fd, -8, SEEK_END) < 0) {
+	if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) {
+		pr_err("Failed to lseek: %d\n", -errno);
+		return -errno;
+	}
+	if (read(fd, magic, BOOTCONFIG_MAGIC_LEN) < 0)
+		return -errno;
+	/* Check the bootconfig magic bytes */
+	if (memcmp(magic, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN) != 0)
+		return 0;
+
+	if (lseek(fd, -(8 + BOOTCONFIG_MAGIC_LEN), SEEK_END) < 0) {
 		pr_err("Failed to lseek: %d\n", -errno);
 		return -errno;
 	}
@@ -150,11 +161,14 @@ int load_xbc_from_initrd(int fd, char **buf)
 	if (read(fd, &csum, sizeof(u32)) < 0)
 		return -errno;
 
-	/* Wrong size, maybe no boot config here */
-	if (stat.st_size < size + 8)
-		return 0;
+	/* Wrong size error  */
+	if (stat.st_size < size + 8 + BOOTCONFIG_MAGIC_LEN) {
+		pr_err("bootconfig size is too big\n");
+		return -E2BIG;
+	}
 
-	if (lseek(fd, stat.st_size - 8 - size, SEEK_SET) < 0) {
+	if (lseek(fd, stat.st_size - (size + 8 + BOOTCONFIG_MAGIC_LEN),
+		  SEEK_SET) < 0) {
 		pr_err("Failed to lseek: %d\n", -errno);
 		return -errno;
 	}
@@ -163,17 +177,17 @@ int load_xbc_from_initrd(int fd, char **buf)
 	if (ret < 0)
 		return ret;
 
-	/* Wrong Checksum, maybe no boot config here */
+	/* Wrong Checksum */
 	rcsum = checksum((unsigned char *)*buf, size);
 	if (csum != rcsum) {
 		pr_err("checksum error: %d != %d\n", csum, rcsum);
-		return 0;
+		return -EINVAL;
 	}
 
 	ret = xbc_init(*buf);
-	/* Wrong data, maybe no boot config here */
+	/* Wrong data */
 	if (ret < 0)
-		return 0;
+		return ret;
 
 	return size;
 }
@@ -226,7 +240,8 @@ int delete_xbc(const char *path)
 	} else if (size > 0) {
 		ret = fstat(fd, &stat);
 		if (!ret)
-			ret = ftruncate(fd, stat.st_size - size - 8);
+			ret = ftruncate(fd, stat.st_size
+					- size - 8 - BOOTCONFIG_MAGIC_LEN);
 		if (ret)
 			ret = -errno;
 	} /* Ignore if there is no boot config in initrd */
@@ -295,6 +310,12 @@ int apply_xbc(const char *path, const char *xbc_path)
 		pr_err("Failed to apply a boot config: %d\n", ret);
 		return ret;
 	}
+	/* Write a magic word of the bootconfig */
+	ret = write(fd, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN);
+	if (ret < 0) {
+		pr_err("Failed to apply a boot config magic: %d\n", ret);
+		return ret;
+	}
 	close(fd);
 	free(data);
 
diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh
index 1de06de328e2..adafb7c50940 100755
--- a/tools/bootconfig/test-bootconfig.sh
+++ b/tools/bootconfig/test-bootconfig.sh
@@ -49,7 +49,7 @@ xpass $BOOTCONF -a $TEMPCONF $INITRD
 new_size=$(stat -c %s $INITRD)
 
 echo "File size check"
-xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9)
+xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9 + 12)
 
 echo "Apply command repeat test"
 xpass $BOOTCONF -a $TEMPCONF $INITRD

From 15e95037b45f24f9ab6d4f0bd101d4df0be24c1d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 20 Feb 2020 21:18:52 +0900
Subject: [PATCH 104/239] tools/bootconfig: Remove unneeded error message
 silencer

Remove error message silent knob, we don't need it anymore
because we can check if there is a bootconfig by checking
the magic word.
If there is a magic word, but failed to load a bootconfig
from initrd, there is a real problem.

Link: http://lkml.kernel.org/r/158220113256.26565.14264598654427773104.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 tools/bootconfig/include/linux/printk.h | 5 +----
 tools/bootconfig/main.c                 | 8 --------
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/tools/bootconfig/include/linux/printk.h b/tools/bootconfig/include/linux/printk.h
index e978a63d3222..036e667596eb 100644
--- a/tools/bootconfig/include/linux/printk.h
+++ b/tools/bootconfig/include/linux/printk.h
@@ -4,10 +4,7 @@
 
 #include <stdio.h>
 
-/* controllable printf */
-extern int pr_output;
-#define printk(fmt, ...)	\
-	(pr_output ? printf(fmt, ##__VA_ARGS__) : 0)
+#define printk(fmt, ...) printf(fmt, ##__VA_ARGS__)
 
 #define pr_err printk
 #define pr_warn	printk
diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
index 742271f019a9..a9b97814d1a9 100644
--- a/tools/bootconfig/main.c
+++ b/tools/bootconfig/main.c
@@ -14,8 +14,6 @@
 #include <linux/kernel.h>
 #include <linux/bootconfig.h>
 
-int pr_output = 1;
-
 static int xbc_show_array(struct xbc_node *node)
 {
 	const char *val;
@@ -227,13 +225,7 @@ int delete_xbc(const char *path)
 		return -errno;
 	}
 
-	/*
-	 * Suppress error messages in xbc_init() because it can be just a
-	 * data which concidentally matches the size and checksum footer.
-	 */
-	pr_output = 0;
 	size = load_xbc_from_initrd(fd, &buf);
-	pr_output = 1;
 	if (size < 0) {
 		ret = size;
 		pr_err("Failed to load a boot config from initrd: %d\n", ret);

From a24d286f36104ed45108a5a36f3868938434772f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 20 Feb 2020 21:19:12 +0900
Subject: [PATCH 105/239] bootconfig: Reject subkey and value on same parent
 key

Reject if a value node is mixed with subkey node on same
parent key node.

A value node can not co-exist with subkey node under some key
node, e.g.

key = value
key.subkey = another-value

This is not be allowed because bootconfig API is not designed
to handle such case.

Link: http://lkml.kernel.org/r/158220115232.26565.7792340045009731803.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/admin-guide/bootconfig.rst     |  7 +++++++
 lib/bootconfig.c                             | 16 ++++++++++++----
 tools/bootconfig/samples/bad-mixed-kv1.bconf |  3 +++
 tools/bootconfig/samples/bad-mixed-kv2.bconf |  3 +++
 4 files changed, 25 insertions(+), 4 deletions(-)
 create mode 100644 tools/bootconfig/samples/bad-mixed-kv1.bconf
 create mode 100644 tools/bootconfig/samples/bad-mixed-kv2.bconf

diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index 5e7609936507..dfeffa73dca3 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -62,6 +62,13 @@ Or more shorter, written as following::
 In both styles, same key words are automatically merged when parsing it
 at boot time. So you can append similar trees or key-values.
 
+Note that a sub-key and a value can not co-exist under a parent key.
+For example, following config is NOT allowed.::
+
+ foo = value1
+ foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist
+
+
 Comments
 --------
 
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 3ea601a2eba5..54ac623ca781 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -533,7 +533,7 @@ struct xbc_node *find_match_node(struct xbc_node *node, char *k)
 
 static int __init __xbc_add_key(char *k)
 {
-	struct xbc_node *node;
+	struct xbc_node *node, *child;
 
 	if (!xbc_valid_keyword(k))
 		return xbc_parse_error("Invalid keyword", k);
@@ -543,8 +543,12 @@ static int __init __xbc_add_key(char *k)
 
 	if (!last_parent)	/* the first level */
 		node = find_match_node(xbc_nodes, k);
-	else
-		node = find_match_node(xbc_node_get_child(last_parent), k);
+	else {
+		child = xbc_node_get_child(last_parent);
+		if (child && xbc_node_is_value(child))
+			return xbc_parse_error("Subkey is mixed with value", k);
+		node = find_match_node(child, k);
+	}
 
 	if (node)
 		last_parent = node;
@@ -577,7 +581,7 @@ static int __init __xbc_parse_keys(char *k)
 static int __init xbc_parse_kv(char **k, char *v)
 {
 	struct xbc_node *prev_parent = last_parent;
-	struct xbc_node *node;
+	struct xbc_node *node, *child;
 	char *next;
 	int c, ret;
 
@@ -585,6 +589,10 @@ static int __init xbc_parse_kv(char **k, char *v)
 	if (ret)
 		return ret;
 
+	child = xbc_node_get_child(last_parent);
+	if (child && xbc_node_is_key(child))
+		return xbc_parse_error("Value is mixed with subkey", v);
+
 	c = __xbc_parse_value(&v, &next);
 	if (c < 0)
 		return c;
diff --git a/tools/bootconfig/samples/bad-mixed-kv1.bconf b/tools/bootconfig/samples/bad-mixed-kv1.bconf
new file mode 100644
index 000000000000..1761547dd05c
--- /dev/null
+++ b/tools/bootconfig/samples/bad-mixed-kv1.bconf
@@ -0,0 +1,3 @@
+# value -> subkey pattern
+key = value
+key.subkey = another-value
diff --git a/tools/bootconfig/samples/bad-mixed-kv2.bconf b/tools/bootconfig/samples/bad-mixed-kv2.bconf
new file mode 100644
index 000000000000..6b32e0c3878c
--- /dev/null
+++ b/tools/bootconfig/samples/bad-mixed-kv2.bconf
@@ -0,0 +1,3 @@
+# subkey -> value pattern
+key.subkey = value
+key = another-value

From 88b913718db94697497028b85acbec8b180a4333 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Thu, 20 Feb 2020 21:19:42 +0900
Subject: [PATCH 106/239] bootconfig: Print array as multiple commands for
 legacy command line

Print arraied values as multiple same options for legacy
kernel command line. With this rule, if the "kernel.*" and
"init.*" array entries in bootconfig are printed out as
multiple same options, e.g.

kernel {
 console = "ttyS0,115200"
 console += "tty0"
}

will be correctly converted to

console="ttyS0,115200" console="tty0"

in the kernel command line.

Link: http://lkml.kernel.org/r/158220118213.26565.8163300497009463916.stgit@devnote2

Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 init/main.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/init/main.c b/init/main.c
index 2fe8dec93e68..c9b1ee6bbb8d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -268,7 +268,6 @@ static int __init xbc_snprint_cmdline(char *buf, size_t size,
 {
 	struct xbc_node *knode, *vnode;
 	char *end = buf + size;
-	char c = '\"';
 	const char *val;
 	int ret;
 
@@ -279,25 +278,20 @@ static int __init xbc_snprint_cmdline(char *buf, size_t size,
 			return ret;
 
 		vnode = xbc_node_get_child(knode);
-		ret = snprintf(buf, rest(buf, end), "%s%c", xbc_namebuf,
-				vnode ? '=' : ' ');
-		if (ret < 0)
-			return ret;
-		buf += ret;
-		if (!vnode)
-			continue;
-
-		c = '\"';
-		xbc_array_for_each_value(vnode, val) {
-			ret = snprintf(buf, rest(buf, end), "%c%s", c, val);
+		if (!vnode) {
+			ret = snprintf(buf, rest(buf, end), "%s ", xbc_namebuf);
+			if (ret < 0)
+				return ret;
+			buf += ret;
+			continue;
+		}
+		xbc_array_for_each_value(vnode, val) {
+			ret = snprintf(buf, rest(buf, end), "%s=\"%s\" ",
+				       xbc_namebuf, val);
 			if (ret < 0)
 				return ret;
 			buf += ret;
-			c = ',';
 		}
-		if (rest(buf, end) > 2)
-			strcpy(buf, "\" ");
-		buf += 2;
 	}
 
 	return buf - (end - size);

From 1d0c3924a92e69bfa91163bda83c12a994b4d106 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 15 Feb 2020 16:40:37 -0500
Subject: [PATCH 107/239] ext4: fix potential race between online resizing and
 write operations

During an online resize an array of pointers to buffer heads gets
replaced so it can get enlarged.  If there is a racing block
allocation or deallocation which uses the old array, and the old array
has gotten reused this can lead to a GPF or some other random kernel
memory getting modified.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443
Link: https://lore.kernel.org/r/20200221053458.730016-2-tytso@mit.edu
Reported-by: Suraj Jitindar Singh <surajjs@amazon.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/balloc.c | 14 +++++++++---
 fs/ext4/ext4.h   | 20 +++++++++++++++++-
 fs/ext4/resize.c | 55 ++++++++++++++++++++++++++++++++++++++----------
 fs/ext4/super.c  | 33 ++++++++++++++++++++---------
 4 files changed, 97 insertions(+), 25 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 5f993a411251..8fd0b3cdab4c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -270,6 +270,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct buffer_head *bh_p;
 
 	if (block_group >= ngroups) {
 		ext4_error(sb, "block_group >= groups_count - block_group = %u,"
@@ -280,7 +281,14 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
-	if (!sbi->s_group_desc[group_desc]) {
+	bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc);
+	/*
+	 * sbi_array_rcu_deref returns with rcu unlocked, this is ok since
+	 * the pointer being dereferenced won't be dereferenced again. By
+	 * looking at the usage in add_new_gdb() the value isn't modified,
+	 * just the pointer, and so it remains valid.
+	 */
+	if (!bh_p) {
 		ext4_error(sb, "Group descriptor not loaded - "
 			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
@@ -288,10 +296,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	}
 
 	desc = (struct ext4_group_desc *)(
-		(__u8 *)sbi->s_group_desc[group_desc]->b_data +
+		(__u8 *)bh_p->b_data +
 		offset * EXT4_DESC_SIZE(sb));
 	if (bh)
-		*bh = sbi->s_group_desc[group_desc];
+		*bh = bh_p;
 	return desc;
 }
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 480badcf2783..b51003f75568 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1400,7 +1400,7 @@ struct ext4_sb_info {
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head **s_group_desc;
+	struct buffer_head * __rcu *s_group_desc;
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
@@ -1576,6 +1576,23 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
 
+/*
+ * Returns: sbi->field[index]
+ * Used to access an array element from the following sbi fields which require
+ * rcu protection to avoid dereferencing an invalid pointer due to reassignment
+ * - s_group_desc
+ * - s_group_info
+ * - s_flex_group
+ */
+#define sbi_array_rcu_deref(sbi, field, index)				   \
+({									   \
+	typeof(*((sbi)->field)) _v;					   \
+	rcu_read_lock();						   \
+	_v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index];	   \
+	rcu_read_unlock();						   \
+	_v;								   \
+})
+
 /*
  * Simulate_fail codes
  */
@@ -2730,6 +2747,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
 extern bool ext4_empty_dir(struct inode *inode);
 
 /* resize.c */
+extern void ext4_kvfree_array_rcu(void *to_free);
 extern int ext4_group_add(struct super_block *sb,
 				struct ext4_new_group_data *input);
 extern int ext4_group_extend(struct super_block *sb,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 86a2500ed292..536cc9f38091 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -17,6 +17,33 @@
 
 #include "ext4_jbd2.h"
 
+struct ext4_rcu_ptr {
+	struct rcu_head rcu;
+	void *ptr;
+};
+
+static void ext4_rcu_ptr_callback(struct rcu_head *head)
+{
+	struct ext4_rcu_ptr *ptr;
+
+	ptr = container_of(head, struct ext4_rcu_ptr, rcu);
+	kvfree(ptr->ptr);
+	kfree(ptr);
+}
+
+void ext4_kvfree_array_rcu(void *to_free)
+{
+	struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+
+	if (ptr) {
+		ptr->ptr = to_free;
+		call_rcu(&ptr->rcu, ext4_rcu_ptr_callback);
+		return;
+	}
+	synchronize_rcu();
+	kvfree(to_free);
+}
+
 int ext4_resize_begin(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -542,8 +569,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
 				brelse(gdb);
 				goto out;
 			}
-			memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
-			       gdb->b_size);
+			memcpy(gdb->b_data, sbi_array_rcu_deref(sbi,
+				s_group_desc, j)->b_data, gdb->b_size);
 			set_buffer_uptodate(gdb);
 
 			err = ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -860,13 +887,15 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	}
 	brelse(dind);
 
-	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	rcu_read_lock();
+	o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
 	memcpy(n_group_desc, o_group_desc,
 	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	rcu_read_unlock();
 	n_group_desc[gdb_num] = gdb_bh;
-	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
 	EXT4_SB(sb)->s_gdb_count++;
-	kvfree(o_group_desc);
+	ext4_kvfree_array_rcu(o_group_desc);
 
 	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
 	err = ext4_handle_dirty_super(handle, sb);
@@ -909,9 +938,11 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
 		return err;
 	}
 
-	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	rcu_read_lock();
+	o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
 	memcpy(n_group_desc, o_group_desc,
 	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	rcu_read_unlock();
 	n_group_desc[gdb_num] = gdb_bh;
 
 	BUFFER_TRACE(gdb_bh, "get_write_access");
@@ -922,9 +953,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
 		return err;
 	}
 
-	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
 	EXT4_SB(sb)->s_gdb_count++;
-	kvfree(o_group_desc);
+	ext4_kvfree_array_rcu(o_group_desc);
 	return err;
 }
 
@@ -1188,7 +1219,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
 		 * use non-sparse filesystems anymore.  This is already checked above.
 		 */
 		if (gdb_off) {
-			gdb_bh = sbi->s_group_desc[gdb_num];
+			gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+						     gdb_num);
 			BUFFER_TRACE(gdb_bh, "get_write_access");
 			err = ext4_journal_get_write_access(handle, gdb_bh);
 
@@ -1270,7 +1302,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
 		/*
 		 * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
 		 */
-		gdb_bh = sbi->s_group_desc[gdb_num];
+		gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num);
 		/* Update group descriptor block for new group */
 		gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
 						 gdb_off * EXT4_DESC_SIZE(sb));
@@ -1497,7 +1529,8 @@ static int ext4_flex_group_add(struct super_block *sb,
 		for (; gdb_num <= gdb_num_end; gdb_num++) {
 			struct buffer_head *gdb_bh;
 
-			gdb_bh = sbi->s_group_desc[gdb_num];
+			gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+						     gdb_num);
 			if (old_gdb == gdb_bh->b_blocknr)
 				continue;
 			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f464dff09774..e00bcc19099f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1014,6 +1014,7 @@ static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
+	struct buffer_head **group_desc;
 	int aborted = 0;
 	int i, err;
 
@@ -1046,9 +1047,12 @@ static void ext4_put_super(struct super_block *sb)
 	if (!sb_rdonly(sb))
 		ext4_commit_super(sb, 1);
 
+	rcu_read_lock();
+	group_desc = rcu_dereference(sbi->s_group_desc);
 	for (i = 0; i < sbi->s_gdb_count; i++)
-		brelse(sbi->s_group_desc[i]);
-	kvfree(sbi->s_group_desc);
+		brelse(group_desc[i]);
+	kvfree(group_desc);
+	rcu_read_unlock();
 	kvfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3634,7 +3638,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
 	char *orig_data = kstrdup(data, GFP_KERNEL);
-	struct buffer_head *bh;
+	struct buffer_head *bh, **group_desc;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	ext4_fsblk_t block;
@@ -4290,9 +4294,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed_mount;
 		}
 	}
-	sbi->s_group_desc = kvmalloc_array(db_count,
-					   sizeof(struct buffer_head *),
-					   GFP_KERNEL);
+	rcu_assign_pointer(sbi->s_group_desc,
+			   kvmalloc_array(db_count,
+					  sizeof(struct buffer_head *),
+					  GFP_KERNEL));
 	if (sbi->s_group_desc == NULL) {
 		ext4_msg(sb, KERN_ERR, "not enough memory");
 		ret = -ENOMEM;
@@ -4308,14 +4313,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	for (i = 0; i < db_count; i++) {
+		struct buffer_head *bh;
+
 		block = descriptor_loc(sb, logical_sb_block, i);
-		sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
-		if (!sbi->s_group_desc[i]) {
+		bh = sb_bread_unmovable(sb, block);
+		if (!bh) {
 			ext4_msg(sb, KERN_ERR,
 			       "can't read group descriptor %d", i);
 			db_count = i;
 			goto failed_mount2;
 		}
+		rcu_read_lock();
+		rcu_dereference(sbi->s_group_desc)[i] = bh;
+		rcu_read_unlock();
 	}
 	sbi->s_gdb_count = db_count;
 	if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
@@ -4717,9 +4727,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
+	rcu_read_lock();
+	group_desc = rcu_dereference(sbi->s_group_desc);
 	for (i = 0; i < db_count; i++)
-		brelse(sbi->s_group_desc[i]);
-	kvfree(sbi->s_group_desc);
+		brelse(group_desc[i]);
+	kvfree(group_desc);
+	rcu_read_unlock();
 failed_mount:
 	if (sbi->s_chksum_driver)
 		crypto_free_shash(sbi->s_chksum_driver);

From df3da4ea5a0fc5d115c90d5aa6caa4dd433750a7 Mon Sep 17 00:00:00 2001
From: Suraj Jitindar Singh <surajjs@amazon.com>
Date: Tue, 18 Feb 2020 19:08:50 -0800
Subject: [PATCH 108/239] ext4: fix potential race between s_group_info online
 resizing and access

During an online resize an array of pointers to s_group_info gets replaced
so it can get enlarged. If there is a concurrent access to the array in
ext4_get_group_info() and this memory has been reused then this can lead to
an invalid memory access.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443
Link: https://lore.kernel.org/r/20200221053458.730016-3-tytso@mit.edu
Signed-off-by: Suraj Jitindar Singh <surajjs@amazon.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Balbir Singh <sblbir@amazon.com>
Cc: stable@kernel.org
---
 fs/ext4/ext4.h    |  8 ++++----
 fs/ext4/mballoc.c | 52 +++++++++++++++++++++++++++++++----------------
 2 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b51003f75568..b1ece5329738 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1462,7 +1462,7 @@ struct ext4_sb_info {
 #endif
 
 	/* for buddy allocator */
-	struct ext4_group_info ***s_group_info;
+	struct ext4_group_info ** __rcu *s_group_info;
 	struct inode *s_buddy_cache;
 	spinlock_t s_md_lock;
 	unsigned short *s_mb_offsets;
@@ -2994,13 +2994,13 @@ static inline
 struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 					    ext4_group_t group)
 {
-	 struct ext4_group_info ***grp_info;
+	 struct ext4_group_info **grp_info;
 	 long indexv, indexh;
 	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
-	 grp_info = EXT4_SB(sb)->s_group_info;
 	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
-	 return grp_info[indexv][indexh];
+	 grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+	 return grp_info[indexh];
 }
 
 /*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f64838187559..1b46fb63692a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2356,7 +2356,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	unsigned size;
-	struct ext4_group_info ***new_groupinfo;
+	struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
 
 	size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
 		EXT4_DESC_PER_BLOCK_BITS(sb);
@@ -2369,13 +2369,16 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
 		ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
 		return -ENOMEM;
 	}
-	if (sbi->s_group_info) {
-		memcpy(new_groupinfo, sbi->s_group_info,
+	rcu_read_lock();
+	old_groupinfo = rcu_dereference(sbi->s_group_info);
+	if (old_groupinfo)
+		memcpy(new_groupinfo, old_groupinfo,
 		       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
-		kvfree(sbi->s_group_info);
-	}
-	sbi->s_group_info = new_groupinfo;
+	rcu_read_unlock();
+	rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
 	sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+	if (old_groupinfo)
+		ext4_kvfree_array_rcu(old_groupinfo);
 	ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
 		   sbi->s_group_info_size);
 	return 0;
@@ -2387,6 +2390,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 {
 	int i;
 	int metalen = 0;
+	int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_info **meta_group_info;
 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
@@ -2405,12 +2409,12 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 				 "for a buddy group");
 			goto exit_meta_group_info;
 		}
-		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
-			meta_group_info;
+		rcu_read_lock();
+		rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
+		rcu_read_unlock();
 	}
 
-	meta_group_info =
-		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+	meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
 	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 
 	meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
@@ -2458,8 +2462,13 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 exit_group_info:
 	/* If a meta_group_info table has been allocated, release it now */
 	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
-		kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
-		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+		struct ext4_group_info ***group_info;
+
+		rcu_read_lock();
+		group_info = rcu_dereference(sbi->s_group_info);
+		kfree(group_info[idx]);
+		group_info[idx] = NULL;
+		rcu_read_unlock();
 	}
 exit_meta_group_info:
 	return -ENOMEM;
@@ -2472,6 +2481,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int err;
 	struct ext4_group_desc *desc;
+	struct ext4_group_info ***group_info;
 	struct kmem_cache *cachep;
 
 	err = ext4_mb_alloc_groupinfo(sb, ngroups);
@@ -2507,11 +2517,16 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	while (i-- > 0)
 		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
 	i = sbi->s_group_info_size;
+	rcu_read_lock();
+	group_info = rcu_dereference(sbi->s_group_info);
 	while (i-- > 0)
-		kfree(sbi->s_group_info[i]);
+		kfree(group_info[i]);
+	rcu_read_unlock();
 	iput(sbi->s_buddy_cache);
 err_freesgi:
-	kvfree(sbi->s_group_info);
+	rcu_read_lock();
+	kvfree(rcu_dereference(sbi->s_group_info));
+	rcu_read_unlock();
 	return -ENOMEM;
 }
 
@@ -2700,7 +2715,7 @@ int ext4_mb_release(struct super_block *sb)
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
 	int num_meta_group_infos;
-	struct ext4_group_info *grinfo;
+	struct ext4_group_info *grinfo, ***group_info;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
@@ -2719,9 +2734,12 @@ int ext4_mb_release(struct super_block *sb)
 		num_meta_group_infos = (ngroups +
 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
 			EXT4_DESC_PER_BLOCK_BITS(sb);
+		rcu_read_lock();
+		group_info = rcu_dereference(sbi->s_group_info);
 		for (i = 0; i < num_meta_group_infos; i++)
-			kfree(sbi->s_group_info[i]);
-		kvfree(sbi->s_group_info);
+			kfree(group_info[i]);
+		kvfree(group_info);
+		rcu_read_unlock();
 	}
 	kfree(sbi->s_mb_offsets);
 	kfree(sbi->s_mb_maxs);

From fd1d98650ac0042d475155116e65fd17eb379542 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Tue, 8 Oct 2019 14:37:02 +0800
Subject: [PATCH 109/239] MAINTAINERS: csky: Add mailing list for csky

Add mailing list and it's convenient for maintain C-SKY
subsystem.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a0d86490c2c6..a4b5858b29bd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3649,6 +3649,7 @@ F:	sound/pci/oxygen/
 
 C-SKY ARCHITECTURE
 M:	Guo Ren <guoren@kernel.org>
+L:	linux-csky@vger.kernel.org
 T:	git https://github.com/c-sky/csky-linux.git
 S:	Supported
 F:	arch/csky/

From 2f78c73f78c39dabc5c44ad8dd61fd6ec65636d6 Mon Sep 17 00:00:00 2001
From: Mao Han <han_mao@c-sky.com>
Date: Fri, 11 Oct 2019 10:56:55 +0800
Subject: [PATCH 110/239] csky: Initial stack protector support

This is a basic -fstack-protector support without per-task canary
switching. The protector will report something like when stack
corruption is detected:

It's tested with strcpy local array overflow in sys_kill and get:
stack-protector: Kernel stack is corrupted in: sys_kill+0x23c/0x23c

TODO:
 - Support task switch for different cannary

Signed-off-by: Mao Han <han_mao@c-sky.com>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig                      |  1 +
 arch/csky/include/asm/stackprotector.h | 29 ++++++++++++++++++++++++++
 arch/csky/kernel/process.c             |  6 ++++++
 3 files changed, 36 insertions(+)
 create mode 100644 arch/csky/include/asm/stackprotector.h

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index da09c884cc30..ea7f3b890d03 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -49,6 +49,7 @@ config CSKY
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
+	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
 	select MAY_HAVE_SPARSE_IRQ
 	select MODULES_USE_ELF_RELA if MODULES
diff --git a/arch/csky/include/asm/stackprotector.h b/arch/csky/include/asm/stackprotector.h
new file mode 100644
index 000000000000..d7cd4e51edd9
--- /dev/null
+++ b/arch/csky/include/asm/stackprotector.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_STACKPROTECTOR_H
+#define _ASM_STACKPROTECTOR_H 1
+
+#include <linux/random.h>
+#include <linux/version.h>
+
+extern unsigned long __stack_chk_guard;
+
+/*
+ * Initialize the stackprotector canary value.
+ *
+ * NOTE: this must only be called from functions that never return,
+ * and it must always be inlined.
+ */
+static __always_inline void boot_init_stack_canary(void)
+{
+	unsigned long canary;
+
+	/* Try to get a semi random initial value. */
+	get_random_bytes(&canary, sizeof(canary));
+	canary ^= LINUX_VERSION_CODE;
+	canary &= CANARY_MASK;
+
+	current->stack_canary = canary;
+	__stack_chk_guard = current->stack_canary;
+}
+
+#endif /* __ASM_SH_STACKPROTECTOR_H */
diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c
index f320d9248a22..5349cd8c0f30 100644
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -16,6 +16,12 @@
 
 struct cpuinfo_csky cpu_data[NR_CPUS];
 
+#ifdef CONFIG_STACKPROTECTOR
+#include <linux/stackprotector.h>
+unsigned long __stack_chk_guard __read_mostly;
+EXPORT_SYMBOL(__stack_chk_guard);
+#endif
+
 asmlinkage void ret_from_fork(void);
 asmlinkage void ret_from_kernel_thread(void);
 

From f525bb2c9e7cf1e3c43ab57704c9e1c836d30b34 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Wed, 27 Nov 2019 08:44:33 +0800
Subject: [PATCH 111/239] csky: Tightly-Coupled Memory or Sram support

The implementation are not only used by TCM but also used by sram on
SOC bus. It follow existed linux tcm software interface, so that old
tcm application codes could be re-used directly.

Software interface list in asm/tcm.h:
 - Variables/Const: 	__tcmdata, __tcmconst
 - Functions:		__tcmfunc, __tcmlocalfunc
 - Malloc/Free:		tcm_alloc, tcm_free

In linux menuconfig:
 - Choose a TCM contain instrctions + data or separated in ITCM/DTCM.
 - Determine TCM_BASE (DTCM_BASE) in phyiscal address.
 - Determine size of TCM or ITCM(DTCM) in page counts.

Here is hello tcm example from Documentation/arm/tcm.rst which could
be directly used:

/* Uninitialized data */
static u32 __tcmdata tcmvar;
/* Initialized data */
static u32 __tcmdata tcmassigned = 0x2BADBABEU;
/* Constant */
static const u32 __tcmconst tcmconst = 0xCAFEBABEU;

static void __tcmlocalfunc tcm_to_tcm(void)
{
	int i;
	for (i = 0; i < 100; i++)
		tcmvar ++;
}

static void __tcmfunc hello_tcm(void)
{
	/* Some abstract code that runs in ITCM */
	int i;
	for (i = 0; i < 100; i++) {
		tcmvar ++;
	}
	tcm_to_tcm();
}

static void __init test_tcm(void)
{
	u32 *tcmem;
	int i;

	hello_tcm();
	printk("Hello TCM executed from ITCM RAM\n");

	printk("TCM variable from testrun: %u @ %p\n", tcmvar, &tcmvar);
	tcmvar = 0xDEADBEEFU;
	printk("TCM variable: 0x%x @ %p\n", tcmvar, &tcmvar);

	printk("TCM assigned variable: 0x%x @ %p\n", tcmassigned, &tcmassigned);

	printk("TCM constant: 0x%x @ %p\n", tcmconst, &tcmconst);

	/* Allocate some TCM memory from the pool */
	tcmem = tcm_alloc(20);
	if (tcmem) {
		printk("TCM Allocated 20 bytes of TCM @ %p\n", tcmem);
		tcmem[0] = 0xDEADBEEFU;
		tcmem[1] = 0x2BADBABEU;
		tcmem[2] = 0xCAFEBABEU;
		tcmem[3] = 0xDEADBEEFU;
		tcmem[4] = 0x2BADBABEU;
		for (i = 0; i < 5; i++)
			printk("TCM tcmem[%d] = %08x\n", i, tcmem[i]);
		tcm_free(tcmem, 20);
	}
}

TODO:
 - Separate fixup mapping from highmem
 - Support abiv1

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig              |  35 +++++++
 arch/csky/include/asm/fixmap.h |   5 +-
 arch/csky/include/asm/memory.h |  22 +++++
 arch/csky/include/asm/tcm.h    |  24 +++++
 arch/csky/kernel/vmlinux.lds.S |  49 ++++++++++
 arch/csky/mm/Makefile          |   1 +
 arch/csky/mm/tcm.c             | 169 +++++++++++++++++++++++++++++++++
 7 files changed, 304 insertions(+), 1 deletion(-)
 create mode 100644 arch/csky/include/asm/memory.h
 create mode 100644 arch/csky/include/asm/tcm.h
 create mode 100644 arch/csky/mm/tcm.c

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index ea7f3b890d03..fc92f08f0017 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -189,6 +189,41 @@ config CPU_PM_STOP
 	bool "stop"
 endchoice
 
+menuconfig HAVE_TCM
+	bool "Tightly-Coupled/Sram Memory"
+	depends on HIGHMEM
+	select GENERIC_ALLOCATOR
+	help
+	  The implementation are not only used by TCM (Tightly-Coupled Meory)
+	  but also used by sram on SOC bus. It follow existed linux tcm
+	  software interface, so that old tcm application codes could be
+	  re-used directly.
+
+if HAVE_TCM
+config ITCM_RAM_BASE
+	hex "ITCM ram base"
+	default 0xffffffff
+
+config ITCM_NR_PAGES
+	int "Page count of ITCM size: NR*4KB"
+	range 1 256
+	default 32
+
+config HAVE_DTCM
+	bool "DTCM Support"
+
+config DTCM_RAM_BASE
+	hex "DTCM ram base"
+	depends on HAVE_DTCM
+	default 0xffffffff
+
+config DTCM_NR_PAGES
+	int "Page count of DTCM size: NR*4KB"
+	depends on HAVE_DTCM
+	range 1 256
+	default 32
+endif
+
 config CPU_HAS_VDSP
 	bool "CPU has VDSP coprocessor"
 	depends on CPU_HAS_FPU && CPU_HAS_FPUV2
diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h
index 380ff0a307df..4350578faa1c 100644
--- a/arch/csky/include/asm/fixmap.h
+++ b/arch/csky/include/asm/fixmap.h
@@ -5,12 +5,16 @@
 #define __ASM_CSKY_FIXMAP_H
 
 #include <asm/page.h>
+#include <asm/memory.h>
 #ifdef CONFIG_HIGHMEM
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
 #endif
 
 enum fixed_addresses {
+#ifdef CONFIG_HAVE_TCM
+	FIX_TCM = TCM_NR_PAGES,
+#endif
 #ifdef CONFIG_HIGHMEM
 	FIX_KMAP_BEGIN,
 	FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1,
@@ -18,7 +22,6 @@ enum fixed_addresses {
 	__end_of_fixed_addresses
 };
 
-#define FIXADDR_TOP	0xffffc000
 #define FIXADDR_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
 #define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)
 
diff --git a/arch/csky/include/asm/memory.h b/arch/csky/include/asm/memory.h
new file mode 100644
index 000000000000..cdffbd0a500b
--- /dev/null
+++ b/arch/csky/include/asm/memory.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_CSKY_MEMORY_H
+#define __ASM_CSKY_MEMORY_H
+
+#include <linux/compiler.h>
+#include <linux/const.h>
+#include <linux/types.h>
+#include <linux/sizes.h>
+
+#define FIXADDR_TOP	_AC(0xffffc000, UL)
+
+#ifdef CONFIG_HAVE_TCM
+#ifdef CONFIG_HAVE_DTCM
+#define TCM_NR_PAGES	(CONFIG_ITCM_NR_PAGES + CONFIG_DTCM_NR_PAGES)
+#else
+#define TCM_NR_PAGES	(CONFIG_ITCM_NR_PAGES)
+#endif
+#define FIXADDR_TCM	_AC(FIXADDR_TOP - (TCM_NR_PAGES * PAGE_SIZE), UL)
+#endif
+
+#endif
diff --git a/arch/csky/include/asm/tcm.h b/arch/csky/include/asm/tcm.h
new file mode 100644
index 000000000000..2b135cefb73f
--- /dev/null
+++ b/arch/csky/include/asm/tcm.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_CSKY_TCM_H
+#define __ASM_CSKY_TCM_H
+
+#ifndef CONFIG_HAVE_TCM
+#error "You should not be including tcm.h unless you have a TCM!"
+#endif
+
+#include <linux/compiler.h>
+
+/* Tag variables with this */
+#define __tcmdata __section(.tcm.data)
+/* Tag constants with this */
+#define __tcmconst __section(.tcm.rodata)
+/* Tag functions inside TCM called from outside TCM with this */
+#define __tcmfunc __section(.tcm.text) noinline
+/* Tag function inside TCM called from inside TCM  with this */
+#define __tcmlocalfunc __section(.tcm.text)
+
+void *tcm_alloc(size_t len);
+void tcm_free(void *addr, size_t len);
+
+#endif
diff --git a/arch/csky/kernel/vmlinux.lds.S b/arch/csky/kernel/vmlinux.lds.S
index 2ff37beaf2bf..f05b413df328 100644
--- a/arch/csky/kernel/vmlinux.lds.S
+++ b/arch/csky/kernel/vmlinux.lds.S
@@ -2,6 +2,7 @@
 
 #include <asm/vmlinux.lds.h>
 #include <asm/page.h>
+#include <asm/memory.h>
 
 OUTPUT_ARCH(csky)
 ENTRY(_start)
@@ -53,6 +54,54 @@ SECTIONS
 	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
 	_edata = .;
 
+#ifdef CONFIG_HAVE_TCM
+	.tcm_start : {
+		. = ALIGN(PAGE_SIZE);
+		__tcm_start = .;
+	}
+
+	.text_data_tcm FIXADDR_TCM : AT(__tcm_start)
+	{
+		. = ALIGN(4);
+		__stcm_text_data = .;
+		*(.tcm.text)
+		*(.tcm.rodata)
+#ifndef CONFIG_HAVE_DTCM
+		*(.tcm.data)
+#endif
+		. = ALIGN(4);
+		__etcm_text_data = .;
+	}
+
+	. = ADDR(.tcm_start) + SIZEOF(.tcm_start) + SIZEOF(.text_data_tcm);
+
+#ifdef CONFIG_HAVE_DTCM
+	#define ITCM_SIZE	CONFIG_ITCM_NR_PAGES * PAGE_SIZE
+
+	.dtcm_start : {
+		__dtcm_start = .;
+	}
+
+	.data_tcm FIXADDR_TCM + ITCM_SIZE : AT(__dtcm_start)
+	{
+		. = ALIGN(4);
+		__stcm_data = .;
+		*(.tcm.data)
+		. = ALIGN(4);
+		__etcm_data = .;
+	}
+
+	. = ADDR(.dtcm_start) + SIZEOF(.data_tcm);
+
+	.tcm_end : AT(ADDR(.dtcm_start) + SIZEOF(.data_tcm)) {
+#else
+	.tcm_end : AT(ADDR(.tcm_start) + SIZEOF(.text_data_tcm)) {
+#endif
+		. = ALIGN(PAGE_SIZE);
+		__tcm_end = .;
+	}
+#endif
+
 	EXCEPTION_TABLE(L1_CACHE_BYTES)
 	BSS_SECTION(L1_CACHE_BYTES, PAGE_SIZE, L1_CACHE_BYTES)
 	VBR_BASE
diff --git a/arch/csky/mm/Makefile b/arch/csky/mm/Makefile
index c94ef6481098..2c51918eb4fc 100644
--- a/arch/csky/mm/Makefile
+++ b/arch/csky/mm/Makefile
@@ -14,3 +14,4 @@ obj-y +=			syscache.o
 obj-y +=			tlb.o
 obj-y +=			asid.o
 obj-y +=			context.o
+obj-$(CONFIG_HAVE_TCM) +=	tcm.o
diff --git a/arch/csky/mm/tcm.c b/arch/csky/mm/tcm.c
new file mode 100644
index 000000000000..ddeb36328819
--- /dev/null
+++ b/arch/csky/mm/tcm.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/highmem.h>
+#include <linux/genalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
+
+#if (CONFIG_ITCM_RAM_BASE == 0xffffffff)
+#error "You should define ITCM_RAM_BASE"
+#endif
+
+#ifdef CONFIG_HAVE_DTCM
+#if (CONFIG_DTCM_RAM_BASE == 0xffffffff)
+#error "You should define DTCM_RAM_BASE"
+#endif
+
+#if (CONFIG_DTCM_RAM_BASE == CONFIG_ITCM_RAM_BASE)
+#error "You should define correct DTCM_RAM_BASE"
+#endif
+#endif
+
+extern char __tcm_start, __tcm_end, __dtcm_start;
+
+static struct gen_pool *tcm_pool;
+
+static void __init tcm_mapping_init(void)
+{
+	pte_t *tcm_pte;
+	unsigned long vaddr, paddr;
+	int i;
+
+	paddr = CONFIG_ITCM_RAM_BASE;
+
+	if (pfn_valid(PFN_DOWN(CONFIG_ITCM_RAM_BASE)))
+		goto panic;
+
+#ifndef CONFIG_HAVE_DTCM
+	for (i = 0; i < TCM_NR_PAGES; i++) {
+#else
+	for (i = 0; i < CONFIG_ITCM_NR_PAGES; i++) {
+#endif
+		vaddr = __fix_to_virt(FIX_TCM - i);
+
+		tcm_pte =
+			pte_offset_kernel((pmd_t *)pgd_offset_k(vaddr), vaddr);
+
+		set_pte(tcm_pte, pfn_pte(__phys_to_pfn(paddr), PAGE_KERNEL));
+
+		flush_tlb_one(vaddr);
+
+		paddr = paddr + PAGE_SIZE;
+	}
+
+#ifdef CONFIG_HAVE_DTCM
+	if (pfn_valid(PFN_DOWN(CONFIG_DTCM_RAM_BASE)))
+		goto panic;
+
+	paddr = CONFIG_DTCM_RAM_BASE;
+
+	for (i = 0; i < CONFIG_DTCM_NR_PAGES; i++) {
+		vaddr = __fix_to_virt(FIX_TCM - CONFIG_ITCM_NR_PAGES - i);
+
+		tcm_pte =
+			pte_offset_kernel((pmd_t *) pgd_offset_k(vaddr), vaddr);
+
+		set_pte(tcm_pte, pfn_pte(__phys_to_pfn(paddr), PAGE_KERNEL));
+
+		flush_tlb_one(vaddr);
+
+		paddr = paddr + PAGE_SIZE;
+	}
+#endif
+
+#ifndef CONFIG_HAVE_DTCM
+	memcpy((void *)__fix_to_virt(FIX_TCM),
+				&__tcm_start, &__tcm_end - &__tcm_start);
+
+	pr_info("%s: mapping tcm va:0x%08lx to pa:0x%08x\n",
+			__func__, __fix_to_virt(FIX_TCM), CONFIG_ITCM_RAM_BASE);
+
+	pr_info("%s: __tcm_start va:0x%08lx size:%d\n",
+			__func__, (unsigned long)&__tcm_start, &__tcm_end - &__tcm_start);
+#else
+	memcpy((void *)__fix_to_virt(FIX_TCM),
+				&__tcm_start, &__dtcm_start - &__tcm_start);
+
+	pr_info("%s: mapping itcm va:0x%08lx to pa:0x%08x\n",
+			__func__, __fix_to_virt(FIX_TCM), CONFIG_ITCM_RAM_BASE);
+
+	pr_info("%s: __itcm_start va:0x%08lx size:%d\n",
+			__func__, (unsigned long)&__tcm_start, &__dtcm_start - &__tcm_start);
+
+	memcpy((void *)__fix_to_virt(FIX_TCM - CONFIG_ITCM_NR_PAGES),
+				&__dtcm_start, &__tcm_end - &__dtcm_start);
+
+	pr_info("%s: mapping dtcm va:0x%08lx to pa:0x%08x\n",
+			__func__, __fix_to_virt(FIX_TCM - CONFIG_ITCM_NR_PAGES),
+						CONFIG_DTCM_RAM_BASE);
+
+	pr_info("%s: __dtcm_start va:0x%08lx size:%d\n",
+			__func__, (unsigned long)&__dtcm_start, &__tcm_end - &__dtcm_start);
+
+#endif
+	return;
+panic:
+	panic("TCM init error");
+}
+
+void *tcm_alloc(size_t len)
+{
+	unsigned long vaddr;
+
+	if (!tcm_pool)
+		return NULL;
+
+	vaddr = gen_pool_alloc(tcm_pool, len);
+	if (!vaddr)
+		return NULL;
+
+	return (void *) vaddr;
+}
+EXPORT_SYMBOL(tcm_alloc);
+
+void tcm_free(void *addr, size_t len)
+{
+	gen_pool_free(tcm_pool, (unsigned long) addr, len);
+}
+EXPORT_SYMBOL(tcm_free);
+
+static int __init tcm_setup_pool(void)
+{
+#ifndef CONFIG_HAVE_DTCM
+	u32 pool_size = (u32) (TCM_NR_PAGES * PAGE_SIZE)
+				- (u32) (&__tcm_end - &__tcm_start);
+
+	u32 tcm_pool_start = __fix_to_virt(FIX_TCM)
+				+ (u32) (&__tcm_end - &__tcm_start);
+#else
+	u32 pool_size = (u32) (CONFIG_DTCM_NR_PAGES * PAGE_SIZE)
+				- (u32) (&__tcm_end - &__dtcm_start);
+
+	u32 tcm_pool_start = __fix_to_virt(FIX_TCM - CONFIG_ITCM_NR_PAGES)
+				+ (u32) (&__tcm_end - &__dtcm_start);
+#endif
+	int ret;
+
+	tcm_pool = gen_pool_create(2, -1);
+
+	ret = gen_pool_add(tcm_pool, tcm_pool_start, pool_size, -1);
+	if (ret) {
+		pr_err("%s: gen_pool add failed!\n", __func__);
+		return ret;
+	}
+
+	pr_info("%s: Added %d bytes @ 0x%08x to memory pool\n",
+		__func__, pool_size, tcm_pool_start);
+
+	return 0;
+}
+
+static int __init tcm_init(void)
+{
+	tcm_mapping_init();
+
+	tcm_setup_pool();
+
+	return 0;
+}
+arch_initcall(tcm_init);

From f136008f31e91060d6b6e6e031cb2c827448e280 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Sun, 1 Dec 2019 22:34:19 +0800
Subject: [PATCH 112/239] csky: Separate fixaddr_init from highmem

After fixaddr_init is separated from highmem, we could use tcm
without highmem selected. (610 (abiv1) don't support highmem,
but it could use tcm now.)

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig               |  1 -
 arch/csky/include/asm/fixmap.h  |  4 +++
 arch/csky/include/asm/memory.h  |  3 ++
 arch/csky/include/asm/pgtable.h |  6 +---
 arch/csky/kernel/setup.c        |  2 ++
 arch/csky/mm/highmem.c          | 62 ++-------------------------------
 arch/csky/mm/init.c             | 47 +++++++++++++++++++++++++
 7 files changed, 60 insertions(+), 65 deletions(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index fc92f08f0017..00ac7bc5558a 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -191,7 +191,6 @@ endchoice
 
 menuconfig HAVE_TCM
 	bool "Tightly-Coupled/Sram Memory"
-	depends on HIGHMEM
 	select GENERIC_ALLOCATOR
 	help
 	  The implementation are not only used by TCM (Tightly-Coupled Meory)
diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h
index 4350578faa1c..81f9477d5330 100644
--- a/arch/csky/include/asm/fixmap.h
+++ b/arch/csky/include/asm/fixmap.h
@@ -27,4 +27,8 @@ enum fixed_addresses {
 
 #include <asm-generic/fixmap.h>
 
+extern void fixrange_init(unsigned long start, unsigned long end,
+	pgd_t *pgd_base);
+extern void __init fixaddr_init(void);
+
 #endif /* __ASM_CSKY_FIXMAP_H */
diff --git a/arch/csky/include/asm/memory.h b/arch/csky/include/asm/memory.h
index cdffbd0a500b..a65c6759f537 100644
--- a/arch/csky/include/asm/memory.h
+++ b/arch/csky/include/asm/memory.h
@@ -9,6 +9,9 @@
 #include <linux/sizes.h>
 
 #define FIXADDR_TOP	_AC(0xffffc000, UL)
+#define PKMAP_BASE	_AC(0xff800000, UL)
+#define VMALLOC_START	_AC(0xc0008000, UL)
+#define VMALLOC_END	(PKMAP_BASE - (PAGE_SIZE * 2))
 
 #ifdef CONFIG_HAVE_TCM
 #ifdef CONFIG_HAVE_DTCM
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index 4b2a41e15f2e..9b7764cb7645 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -5,6 +5,7 @@
 #define __ASM_CSKY_PGTABLE_H
 
 #include <asm/fixmap.h>
+#include <asm/memory.h>
 #include <asm/addrspace.h>
 #include <abi/pgtable-bits.h>
 #include <asm-generic/pgtable-nopmd.h>
@@ -16,11 +17,6 @@
 #define USER_PTRS_PER_PGD	(0x80000000UL/PGDIR_SIZE)
 #define FIRST_USER_ADDRESS	0UL
 
-#define PKMAP_BASE		(0xff800000)
-
-#define VMALLOC_START		(0xc0008000)
-#define VMALLOC_END		(PKMAP_BASE - 2*PAGE_SIZE)
-
 /*
  * C-SKY is two-level paging structure:
  */
diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c
index 52eaf31ba27f..4fc7b16b77d2 100644
--- a/arch/csky/kernel/setup.c
+++ b/arch/csky/kernel/setup.c
@@ -133,6 +133,8 @@ void __init setup_arch(char **cmdline_p)
 
 	sparse_init();
 
+	fixaddr_init();
+
 #ifdef CONFIG_HIGHMEM
 	kmap_init();
 #endif
diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c
index 3317b774f6dc..813129145f3d 100644
--- a/arch/csky/mm/highmem.c
+++ b/arch/csky/mm/highmem.c
@@ -117,85 +117,29 @@ struct page *kmap_atomic_to_page(void *ptr)
 	return pte_page(*pte);
 }
 
-static void __init fixrange_init(unsigned long start, unsigned long end,
-				pgd_t *pgd_base)
-{
-#ifdef CONFIG_HIGHMEM
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int i, j, k;
-	unsigned long vaddr;
-
-	vaddr = start;
-	i = __pgd_offset(vaddr);
-	j = __pud_offset(vaddr);
-	k = __pmd_offset(vaddr);
-	pgd = pgd_base + i;
-
-	for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) {
-		pud = (pud_t *)pgd;
-		for ( ; (j < PTRS_PER_PUD) && (vaddr != end); pud++, j++) {
-			pmd = (pmd_t *)pud;
-			for (; (k < PTRS_PER_PMD) && (vaddr != end); pmd++, k++) {
-				if (pmd_none(*pmd)) {
-					pte = (pte_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
-					if (!pte)
-						panic("%s: Failed to allocate %lu bytes align=%lx\n",
-						      __func__, PAGE_SIZE,
-						      PAGE_SIZE);
-
-					set_pmd(pmd, __pmd(__pa(pte)));
-					BUG_ON(pte != pte_offset_kernel(pmd, 0));
-				}
-				vaddr += PMD_SIZE;
-			}
-			k = 0;
-		}
-		j = 0;
-	}
-#endif
-}
-
-void __init fixaddr_kmap_pages_init(void)
+static void __init kmap_pages_init(void)
 {
 	unsigned long vaddr;
-	pgd_t *pgd_base;
-#ifdef CONFIG_HIGHMEM
 	pgd_t *pgd;
 	pmd_t *pmd;
 	pud_t *pud;
 	pte_t *pte;
-#endif
-	pgd_base = swapper_pg_dir;
 
-	/*
-	 * Fixed mappings:
-	 */
-	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
-	fixrange_init(vaddr, 0, pgd_base);
-
-#ifdef CONFIG_HIGHMEM
-	/*
-	 * Permanent kmaps:
-	 */
 	vaddr = PKMAP_BASE;
-	fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+	fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, swapper_pg_dir);
 
 	pgd = swapper_pg_dir + __pgd_offset(vaddr);
 	pud = (pud_t *)pgd;
 	pmd = pmd_offset(pud, vaddr);
 	pte = pte_offset_kernel(pmd, vaddr);
 	pkmap_page_table = pte;
-#endif
 }
 
 void __init kmap_init(void)
 {
 	unsigned long vaddr;
 
-	fixaddr_kmap_pages_init();
+	kmap_pages_init();
 
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN);
 
diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index d4c2292ea46b..322eb7bd7962 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -101,3 +101,50 @@ void __init pre_mmu_init(void)
 	/* Setup page mask to 4k */
 	write_mmu_pagemask(0);
 }
+
+void __init fixrange_init(unsigned long start, unsigned long end,
+			pgd_t *pgd_base)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int i, j, k;
+	unsigned long vaddr;
+
+	vaddr = start;
+	i = __pgd_offset(vaddr);
+	j = __pud_offset(vaddr);
+	k = __pmd_offset(vaddr);
+	pgd = pgd_base + i;
+
+	for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) {
+		pud = (pud_t *)pgd;
+		for ( ; (j < PTRS_PER_PUD) && (vaddr != end); pud++, j++) {
+			pmd = (pmd_t *)pud;
+			for (; (k < PTRS_PER_PMD) && (vaddr != end); pmd++, k++) {
+				if (pmd_none(*pmd)) {
+					pte = (pte_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE);
+					if (!pte)
+						panic("%s: Failed to allocate %lu bytes align=%lx\n",
+						      __func__, PAGE_SIZE,
+						      PAGE_SIZE);
+
+					set_pmd(pmd, __pmd(__pa(pte)));
+					BUG_ON(pte != pte_offset_kernel(pmd, 0));
+				}
+				vaddr += PMD_SIZE;
+			}
+			k = 0;
+		}
+		j = 0;
+	}
+}
+
+void __init fixaddr_init(void)
+{
+	unsigned long vaddr;
+
+	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+	fixrange_init(vaddr, vaddr + PMD_SIZE, swapper_pg_dir);
+}

From 7f4a567332f035ab16b29010fbd04a0f10183c77 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Mon, 30 Dec 2019 15:53:37 +0800
Subject: [PATCH 113/239] csky/mm: Fixup export invalid_pte_table symbol

There is no present bit in csky pmd hardware, so we need to prepare invalid_pte_table
for empty pmd entry and the functions (pmd_none & pmd_present) in pgtable.h need
invalid_pte_talbe to get result. If a module use these functions, we need export the
symbol for it.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Cc: Mo Qihui <qihui.mo@verisilicon.com>
Cc: Zhange Jian <zhang_jian5@dahuatech.com>
---
 arch/csky/mm/init.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index 322eb7bd7962..dbb4b2dfe4b7 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -31,6 +31,7 @@
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
 pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned_bss;
+EXPORT_SYMBOL(invalid_pte_table);
 unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
 						__page_aligned_bss;
 EXPORT_SYMBOL(empty_zero_page);

From f8e17c17b81070f38062dce79ca7f4541851dadd Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Tue, 17 Dec 2019 11:12:55 +0800
Subject: [PATCH 114/239] csky: Set regs->usp to kernel sp, when the exception
 is from kernel

In the past, we didn't care about kernel sp when saving pt_reg. But in some
cases, we still need pt_reg->usp to represent the kernel stack before enter
exception.

For cmpxhg in atomic.S, we need save and restore usp for above.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/abiv1/inc/abi/entry.h | 19 ++++++++++++++-----
 arch/csky/abiv2/inc/abi/entry.h | 11 +++++++++++
 arch/csky/kernel/atomic.S       |  8 ++++++--
 3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/arch/csky/abiv1/inc/abi/entry.h b/arch/csky/abiv1/inc/abi/entry.h
index 7ab78bd0f3b1..f35a9f3315ee 100644
--- a/arch/csky/abiv1/inc/abi/entry.h
+++ b/arch/csky/abiv1/inc/abi/entry.h
@@ -16,14 +16,16 @@
 #define LSAVE_A4	40
 #define LSAVE_A5	44
 
+#define usp ss1
+
 .macro USPTOKSP
-	mtcr	sp, ss1
+	mtcr	sp, usp
 	mfcr	sp, ss0
 .endm
 
 .macro KSPTOUSP
 	mtcr	sp, ss0
-	mfcr	sp, ss1
+	mfcr	sp, usp
 .endm
 
 .macro	SAVE_ALL epc_inc
@@ -45,7 +47,13 @@
 	add	lr, r13
 	stw     lr, (sp, 8)
 
+	mov	lr, sp
+	addi	lr, 32
+	addi	lr, 32
+	addi	lr, 16
+	bt	2f
 	mfcr	lr, ss1
+2:
 	stw     lr, (sp, 16)
 
 	stw     a0, (sp, 20)
@@ -79,9 +87,10 @@
 	ldw     a0, (sp, 12)
 	mtcr    a0, epsr
 	btsti   a0, 31
+	bt      1f
 	ldw     a0, (sp, 16)
 	mtcr	a0, ss1
-
+1:
 	ldw     a0, (sp, 24)
 	ldw     a1, (sp, 28)
 	ldw     a2, (sp, 32)
@@ -102,9 +111,9 @@
 	addi	sp, 32
 	addi	sp, 8
 
-	bt      1f
+	bt      2f
 	KSPTOUSP
-1:
+2:
 	rte
 .endm
 
diff --git a/arch/csky/abiv2/inc/abi/entry.h b/arch/csky/abiv2/inc/abi/entry.h
index 9897a16b45e5..94a7a58765df 100644
--- a/arch/csky/abiv2/inc/abi/entry.h
+++ b/arch/csky/abiv2/inc/abi/entry.h
@@ -31,7 +31,13 @@
 
 	mfcr	lr, epsr
 	stw	lr, (sp, 12)
+	btsti   lr, 31
+	bf      1f
+	addi    lr, sp, 152
+	br	2f
+1:
 	mfcr	lr, usp
+2:
 	stw	lr, (sp, 16)
 
 	stw     a0, (sp, 20)
@@ -64,8 +70,10 @@
 	mtcr	a0, epc
 	ldw	a0, (sp, 12)
 	mtcr	a0, epsr
+	btsti   a0, 31
 	ldw	a0, (sp, 16)
 	mtcr	a0, usp
+	mtcr	a0, ss0
 
 #ifdef CONFIG_CPU_HAS_HILO
 	ldw	a0, (sp, 140)
@@ -86,6 +94,9 @@
 	addi    sp, 40
 	ldm     r16-r30, (sp)
 	addi    sp, 72
+	bf	1f
+	mfcr	sp, ss0
+1:
 	rte
 .endm
 
diff --git a/arch/csky/kernel/atomic.S b/arch/csky/kernel/atomic.S
index 5b84f11485ae..3821ef9b7567 100644
--- a/arch/csky/kernel/atomic.S
+++ b/arch/csky/kernel/atomic.S
@@ -17,10 +17,12 @@ ENTRY(csky_cmpxchg)
 	mfcr	a3, epc
 	addi	a3, TRAP0_SIZE
 
-	subi    sp, 8
+	subi    sp, 16
 	stw     a3, (sp, 0)
 	mfcr    a3, epsr
 	stw     a3, (sp, 4)
+	mfcr	a3, usp
+	stw     a3, (sp, 8)
 
 	psrset	ee
 #ifdef CONFIG_CPU_HAS_LDSTEX
@@ -47,7 +49,9 @@ ENTRY(csky_cmpxchg)
 	mtcr	a3, epc
 	ldw     a3, (sp, 4)
 	mtcr	a3, epsr
-	addi	sp, 8
+	ldw     a3, (sp, 8)
+	mtcr	a3, usp
+	addi	sp, 16
 	KSPTOUSP
 	rte
 END(csky_cmpxchg)

From c9492737b25ca32679ba3163609d938c9abfd508 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Tue, 7 Jan 2020 12:21:25 +0800
Subject: [PATCH 115/239] csky/smp: Fixup boot failed when CONFIG_SMP

If we use a non-ipi-support interrupt controller, it will cause panic here.
We should let cpu up and work with CONFIG_SMP, when we use a non-ipi intc.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index b753d382e4ce..0bb0954d5570 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -120,7 +120,7 @@ void __init setup_smp_ipi(void)
 	int rc;
 
 	if (ipi_irq == 0)
-		panic("%s IRQ mapping failed\n", __func__);
+		return;
 
 	rc = request_percpu_irq(ipi_irq, handle_ipi, "IPI Interrupt",
 				&ipi_dummy_dev);

From a736fa1ed772e3640e1bfaab36032c5a285d6a7b Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Sat, 11 Jan 2020 13:44:32 +0800
Subject: [PATCH 116/239] csky/Kconfig: Add Kconfig.platforms to support some
 drivers

Such as snps,dw-apb-ictl

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig           | 2 ++
 arch/csky/Kconfig.platforms | 9 +++++++++
 2 files changed, 11 insertions(+)
 create mode 100644 arch/csky/Kconfig.platforms

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 00ac7bc5558a..1c723cb11cc4 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -270,4 +270,6 @@ config HOTPLUG_CPU
 	  Say N if you want to disable CPU hotplug.
 endmenu
 
+source "arch/csky/Kconfig.platforms"
+
 source "kernel/Kconfig.hz"
diff --git a/arch/csky/Kconfig.platforms b/arch/csky/Kconfig.platforms
new file mode 100644
index 000000000000..639e17f4eacb
--- /dev/null
+++ b/arch/csky/Kconfig.platforms
@@ -0,0 +1,9 @@
+menu "Platform drivers selection"
+
+config ARCH_CSKY_DW_APB_ICTL
+	bool "Select dw-apb interrupt controller"
+	select DW_APB_ICTL
+	default y
+	help
+	  This enables support for snps dw-apb-ictl
+endmenu

From 761b4f694cb90b63ca2739ac8a8a176342636e5e Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Wed, 22 Jan 2020 11:15:14 +0800
Subject: [PATCH 117/239] csky: Support icache flush without specific
 instructions

Some CPUs don't support icache specific instructions to flush icache
lines in broadcast way. We use cpu control registers to flush local
icache and use IPI to notify other cores.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig             |  4 ++++
 arch/csky/include/asm/cache.h |  1 +
 arch/csky/mm/cachev1.c        |  5 +++++
 arch/csky/mm/cachev2.c        | 31 ++++++++++++++++++++++---------
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 1c723cb11cc4..111474b50f69 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -231,6 +231,10 @@ config CPU_HAS_FPU
 	bool "CPU has FPU coprocessor"
 	depends on CPU_CK807 || CPU_CK810 || CPU_CK860
 
+config CPU_HAS_ICACHE_INS
+	bool "CPU has Icache invalidate instructions"
+	depends on CPU_HAS_CACHEV2
+
 config CPU_HAS_TEE
 	bool "CPU has Trusted Execution Environment"
 	depends on CPU_CK810
diff --git a/arch/csky/include/asm/cache.h b/arch/csky/include/asm/cache.h
index 1d5fc2f78fd7..4b5c09bf1d25 100644
--- a/arch/csky/include/asm/cache.h
+++ b/arch/csky/include/asm/cache.h
@@ -16,6 +16,7 @@ void dcache_wb_line(unsigned long start);
 
 void icache_inv_range(unsigned long start, unsigned long end);
 void icache_inv_all(void);
+void local_icache_inv_all(void *priv);
 
 void dcache_wb_range(unsigned long start, unsigned long end);
 void dcache_wbinv_all(void);
diff --git a/arch/csky/mm/cachev1.c b/arch/csky/mm/cachev1.c
index 494ec912abff..5a5a9804a0e3 100644
--- a/arch/csky/mm/cachev1.c
+++ b/arch/csky/mm/cachev1.c
@@ -94,6 +94,11 @@ void icache_inv_all(void)
 	cache_op_all(INS_CACHE|CACHE_INV, 0);
 }
 
+void local_icache_inv_all(void *priv)
+{
+	cache_op_all(INS_CACHE|CACHE_INV, 0);
+}
+
 void dcache_wb_range(unsigned long start, unsigned long end)
 {
 	cache_op_range(start, end, DATA_CACHE|CACHE_CLR, 0);
diff --git a/arch/csky/mm/cachev2.c b/arch/csky/mm/cachev2.c
index b61be6518e21..909fdd0f5995 100644
--- a/arch/csky/mm/cachev2.c
+++ b/arch/csky/mm/cachev2.c
@@ -3,15 +3,25 @@
 
 #include <linux/spinlock.h>
 #include <linux/smp.h>
+#include <linux/mm.h>
 #include <asm/cache.h>
 #include <asm/barrier.h>
 
-inline void dcache_wb_line(unsigned long start)
+#define INS_CACHE		(1 << 0)
+#define CACHE_INV		(1 << 4)
+
+void local_icache_inv_all(void *priv)
 {
-	asm volatile("dcache.cval1 %0\n"::"r"(start):"memory");
+	mtcr("cr17", INS_CACHE|CACHE_INV);
 	sync_is();
 }
 
+void icache_inv_all(void)
+{
+	on_each_cpu(local_icache_inv_all, NULL, 1);
+}
+
+#ifdef CONFIG_CPU_HAS_ICACHE_INS
 void icache_inv_range(unsigned long start, unsigned long end)
 {
 	unsigned long i = start & ~(L1_CACHE_BYTES - 1);
@@ -20,10 +30,16 @@ void icache_inv_range(unsigned long start, unsigned long end)
 		asm volatile("icache.iva %0\n"::"r"(i):"memory");
 	sync_is();
 }
-
-void icache_inv_all(void)
+#else
+void icache_inv_range(unsigned long start, unsigned long end)
 {
-	asm volatile("icache.ialls\n":::"memory");
+	icache_inv_all();
+}
+#endif
+
+inline void dcache_wb_line(unsigned long start)
+{
+	asm volatile("dcache.cval1 %0\n"::"r"(start):"memory");
 	sync_is();
 }
 
@@ -53,10 +69,7 @@ void cache_wbinv_range(unsigned long start, unsigned long end)
 		asm volatile("dcache.cval1 %0\n"::"r"(i):"memory");
 	sync_is();
 
-	i = start & ~(L1_CACHE_BYTES - 1);
-	for (; i < end; i += L1_CACHE_BYTES)
-		asm volatile("icache.iva %0\n"::"r"(i):"memory");
-	sync_is();
+	icache_inv_range(start, end);
 }
 EXPORT_SYMBOL(cache_wbinv_range);
 

From a1176734132c630b50908c36563e05fb3599682c Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Sat, 25 Jan 2020 00:37:09 +0800
Subject: [PATCH 118/239] csky: Remove unnecessary flush_icache_*
 implementation

The abiv2 CPUs are all PIPT cache, so there is no need to implement
flush_icache_page function.

The function flush_icache_user_range hasn't been used, so just
remove it.

The function flush_cache_range is not necessary for PIPT cache when
tlb mapping changed.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/abiv1/inc/abi/cacheflush.h |  3 ---
 arch/csky/abiv2/cacheflush.c         | 23 -----------------------
 arch/csky/abiv2/inc/abi/cacheflush.h | 13 ++-----------
 3 files changed, 2 insertions(+), 37 deletions(-)

diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h
index 79ef9e8c1afd..a73702704f38 100644
--- a/arch/csky/abiv1/inc/abi/cacheflush.h
+++ b/arch/csky/abiv1/inc/abi/cacheflush.h
@@ -49,9 +49,6 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, u
 #define flush_icache_page(vma, page)		do {} while (0);
 #define flush_icache_range(start, end)		cache_wbinv_range(start, end)
 
-#define flush_icache_user_range(vma,page,addr,len) \
-	flush_dcache_page(page)
-
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 do { \
 	memcpy(dst, src, len); \
diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c
index 5bb887b275e1..f64b415f6fde 100644
--- a/arch/csky/abiv2/cacheflush.c
+++ b/arch/csky/abiv2/cacheflush.c
@@ -6,29 +6,6 @@
 #include <linux/mm.h>
 #include <asm/cache.h>
 
-void flush_icache_page(struct vm_area_struct *vma, struct page *page)
-{
-	unsigned long start;
-
-	start = (unsigned long) kmap_atomic(page);
-
-	cache_wbinv_range(start, start + PAGE_SIZE);
-
-	kunmap_atomic((void *)start);
-}
-
-void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
-			     unsigned long vaddr, int len)
-{
-	unsigned long kaddr;
-
-	kaddr = (unsigned long) kmap_atomic(page) + (vaddr & ~PAGE_MASK);
-
-	cache_wbinv_range(kaddr, kaddr + len);
-
-	kunmap_atomic((void *)kaddr);
-}
-
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *pte)
 {
diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h
index b8db5e0b2fe3..62a9031fffd8 100644
--- a/arch/csky/abiv2/inc/abi/cacheflush.h
+++ b/arch/csky/abiv2/inc/abi/cacheflush.h
@@ -13,25 +13,16 @@
 #define flush_cache_all()			do { } while (0)
 #define flush_cache_mm(mm)			do { } while (0)
 #define flush_cache_dup_mm(mm)			do { } while (0)
-
-#define flush_cache_range(vma, start, end) \
-	do { \
-		if (vma->vm_flags & VM_EXEC) \
-			icache_inv_all(); \
-	} while (0)
-
+#define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
 #define flush_dcache_page(page)			do { } while (0)
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
+#define flush_icache_page(vma, page)		do { } while (0)
 
 #define flush_icache_range(start, end)		cache_wbinv_range(start, end)
 
-void flush_icache_page(struct vm_area_struct *vma, struct page *page);
-void flush_icache_user_range(struct vm_area_struct *vma, struct page *page,
-			     unsigned long vaddr, int len);
-
 #define flush_cache_vmap(start, end)		do { } while (0)
 #define flush_cache_vunmap(start, end)		do { } while (0)
 

From d936a7e708dcf22344c4420e8b0e36f5d5f8c073 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Mon, 27 Jan 2020 01:20:36 +0800
Subject: [PATCH 119/239] csky: Enable defer flush_dcache_page for abiv2 cpus
 (807/810/860)

Instead of flushing cache per update_mmu_cache() called, we use
flush_dcache_page to reduce the frequency of flashing the cache.

As abiv2 cpus are all PIPT for icache & dcache, we needn't handle
dcache aliasing problem. But their icache can't snoop dcache, so
we still need sync_icache_dcache in update_mmu_cache().

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/abiv2/cacheflush.c         | 14 ++++++++------
 arch/csky/abiv2/inc/abi/cacheflush.h | 12 ++++++++++--
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c
index f64b415f6fde..ba469953a16e 100644
--- a/arch/csky/abiv2/cacheflush.c
+++ b/arch/csky/abiv2/cacheflush.c
@@ -9,20 +9,22 @@
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 		      pte_t *pte)
 {
-	unsigned long addr, pfn;
+	unsigned long addr;
 	struct page *page;
 
-	pfn = pte_pfn(*pte);
-	if (unlikely(!pfn_valid(pfn)))
+	page = pfn_to_page(pte_pfn(*pte));
+	if (page == ZERO_PAGE(0))
 		return;
 
-	page = pfn_to_page(pfn);
-	if (page == ZERO_PAGE(0))
+	if (test_and_set_bit(PG_dcache_clean, &page->flags))
 		return;
 
 	addr = (unsigned long) kmap_atomic(page);
 
-	cache_wbinv_range(addr, addr + PAGE_SIZE);
+	dcache_wb_range(addr, addr + PAGE_SIZE);
+
+	if (vma->vm_flags & VM_EXEC)
+		icache_inv_range(addr, addr + PAGE_SIZE);
 
 	kunmap_atomic((void *) addr);
 }
diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h
index 62a9031fffd8..acd7c6c55d61 100644
--- a/arch/csky/abiv2/inc/abi/cacheflush.h
+++ b/arch/csky/abiv2/inc/abi/cacheflush.h
@@ -15,8 +15,16 @@
 #define flush_cache_dup_mm(mm)			do { } while (0)
 #define flush_cache_range(vma, start, end)	do { } while (0)
 #define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
-#define flush_dcache_page(page)			do { } while (0)
+
+#define PG_dcache_clean		PG_arch_1
+
+#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
+static inline void flush_dcache_page(struct page *page)
+{
+	if (test_bit(PG_dcache_clean, &page->flags))
+		clear_bit(PG_dcache_clean, &page->flags);
+}
+
 #define flush_dcache_mmap_lock(mapping)		do { } while (0)
 #define flush_dcache_mmap_unlock(mapping)	do { } while (0)
 #define flush_icache_page(vma, page)		do { } while (0)

From cc1f6563a92ced0889775d0587316d725b6e1a68 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Mon, 27 Jan 2020 19:57:29 +0800
Subject: [PATCH 120/239] csky: Optimize abiv2 copy_to_user_page with VM_EXEC

Only when vma is for VM_EXEC, we need sync dcache & icache. eg:
 - gdb ptrace modify user space instruction code area.

Add VM_EXEC condition to reduce unnecessary cache flush.

The abiv1 cpus' cache are all VIPT, so we still need to deal with
dcache aliasing problem. But there is optimized way to use cache
color, just like what's done in arch/csky/abiv1/inc/abi/page.h.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/abiv2/inc/abi/cacheflush.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h
index acd7c6c55d61..28b7c3233175 100644
--- a/arch/csky/abiv2/inc/abi/cacheflush.h
+++ b/arch/csky/abiv2/inc/abi/cacheflush.h
@@ -37,7 +37,9 @@ static inline void flush_dcache_page(struct page *page)
 #define copy_to_user_page(vma, page, vaddr, dst, src, len) \
 do { \
 	memcpy(dst, src, len); \
-	cache_wbinv_range((unsigned long)dst, (unsigned long)dst + len); \
+	if (vma->vm_flags & VM_EXEC) \
+		cache_wbinv_range((unsigned long)dst, \
+				  (unsigned long)dst + len); \
 } while (0)
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 	memcpy(dst, src, len)

From 997153b9a75c08d545ad45e6f8ceb432435d2425 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Fri, 31 Jan 2020 20:33:10 +0800
Subject: [PATCH 121/239] csky: Add flush_icache_mm to defer flush icache all

Some CPUs don't support icache.va instruction to maintain the whole
smp cores' icache. Using icache.all + IPI casue a lot on performace
and using defer mechanism could reduce the number of calling icache
_flush_all functions.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/abiv1/inc/abi/cacheflush.h |  2 +
 arch/csky/abiv2/cacheflush.c         | 55 ++++++++++++++++++++++++++++
 arch/csky/abiv2/inc/abi/cacheflush.h | 14 +++++--
 arch/csky/include/asm/cacheflush.h   |  1 +
 arch/csky/include/asm/mmu.h          |  1 +
 arch/csky/include/asm/mmu_context.h  |  2 +
 arch/csky/mm/syscache.c              | 13 +++----
 7 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h
index a73702704f38..d3e04208d53c 100644
--- a/arch/csky/abiv1/inc/abi/cacheflush.h
+++ b/arch/csky/abiv1/inc/abi/cacheflush.h
@@ -48,6 +48,8 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, u
 
 #define flush_icache_page(vma, page)		do {} while (0);
 #define flush_icache_range(start, end)		cache_wbinv_range(start, end)
+#define flush_icache_mm_range(mm, start, end)	cache_wbinv_range(start, end)
+#define flush_icache_deferred(mm)		do {} while (0);
 
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 do { \
diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c
index ba469953a16e..790f1ebfba44 100644
--- a/arch/csky/abiv2/cacheflush.c
+++ b/arch/csky/abiv2/cacheflush.c
@@ -28,3 +28,58 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 
 	kunmap_atomic((void *) addr);
 }
+
+void flush_icache_deferred(struct mm_struct *mm)
+{
+	unsigned int cpu = smp_processor_id();
+	cpumask_t *mask = &mm->context.icache_stale_mask;
+
+	if (cpumask_test_cpu(cpu, mask)) {
+		cpumask_clear_cpu(cpu, mask);
+		/*
+		 * Ensure the remote hart's writes are visible to this hart.
+		 * This pairs with a barrier in flush_icache_mm.
+		 */
+		smp_mb();
+		local_icache_inv_all(NULL);
+	}
+}
+
+void flush_icache_mm_range(struct mm_struct *mm,
+		unsigned long start, unsigned long end)
+{
+	unsigned int cpu;
+	cpumask_t others, *mask;
+
+	preempt_disable();
+
+#ifdef CONFIG_CPU_HAS_ICACHE_INS
+	if (mm == current->mm) {
+		icache_inv_range(start, end);
+		preempt_enable();
+		return;
+	}
+#endif
+
+	/* Mark every hart's icache as needing a flush for this MM. */
+	mask = &mm->context.icache_stale_mask;
+	cpumask_setall(mask);
+
+	/* Flush this hart's I$ now, and mark it as flushed. */
+	cpu = smp_processor_id();
+	cpumask_clear_cpu(cpu, mask);
+	local_icache_inv_all(NULL);
+
+	/*
+	 * Flush the I$ of other harts concurrently executing, and mark them as
+	 * flushed.
+	 */
+	cpumask_andnot(&others, mm_cpumask(mm), cpumask_of(cpu));
+
+	if (mm != current->active_mm || !cpumask_empty(&others)) {
+		on_each_cpu_mask(&others, local_icache_inv_all, NULL, 1);
+		cpumask_clear(mask);
+	}
+
+	preempt_enable();
+}
diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h
index 28b7c3233175..a565e00c3f70 100644
--- a/arch/csky/abiv2/inc/abi/cacheflush.h
+++ b/arch/csky/abiv2/inc/abi/cacheflush.h
@@ -31,15 +31,23 @@ static inline void flush_dcache_page(struct page *page)
 
 #define flush_icache_range(start, end)		cache_wbinv_range(start, end)
 
+void flush_icache_mm_range(struct mm_struct *mm,
+			unsigned long start, unsigned long end);
+void flush_icache_deferred(struct mm_struct *mm);
+
 #define flush_cache_vmap(start, end)		do { } while (0)
 #define flush_cache_vunmap(start, end)		do { } while (0)
 
 #define copy_to_user_page(vma, page, vaddr, dst, src, len) \
 do { \
 	memcpy(dst, src, len); \
-	if (vma->vm_flags & VM_EXEC) \
-		cache_wbinv_range((unsigned long)dst, \
-				  (unsigned long)dst + len); \
+	if (vma->vm_flags & VM_EXEC) { \
+		dcache_wb_range((unsigned long)dst, \
+				(unsigned long)dst + len); \
+		flush_icache_mm_range(current->mm, \
+				(unsigned long)dst, \
+				(unsigned long)dst + len); \
+		} \
 } while (0)
 #define copy_from_user_page(vma, page, vaddr, dst, src, len) \
 	memcpy(dst, src, len)
diff --git a/arch/csky/include/asm/cacheflush.h b/arch/csky/include/asm/cacheflush.h
index a96da67261ae..f0b8f25429a2 100644
--- a/arch/csky/include/asm/cacheflush.h
+++ b/arch/csky/include/asm/cacheflush.h
@@ -4,6 +4,7 @@
 #ifndef __ASM_CSKY_CACHEFLUSH_H
 #define __ASM_CSKY_CACHEFLUSH_H
 
+#include <linux/mm.h>
 #include <abi/cacheflush.h>
 
 #endif /* __ASM_CSKY_CACHEFLUSH_H */
diff --git a/arch/csky/include/asm/mmu.h b/arch/csky/include/asm/mmu.h
index b382a14ea4ec..26fbb1d15df0 100644
--- a/arch/csky/include/asm/mmu.h
+++ b/arch/csky/include/asm/mmu.h
@@ -7,6 +7,7 @@
 typedef struct {
 	atomic64_t	asid;
 	void *vdso;
+	cpumask_t	icache_stale_mask;
 } mm_context_t;
 
 #endif /* __ASM_CSKY_MMU_H */
diff --git a/arch/csky/include/asm/mmu_context.h b/arch/csky/include/asm/mmu_context.h
index 0285b0ad18b6..abdf1f1cb6ec 100644
--- a/arch/csky/include/asm/mmu_context.h
+++ b/arch/csky/include/asm/mmu_context.h
@@ -43,5 +43,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 	TLBMISS_HANDLER_SETUP_PGD(next->pgd);
 	write_mmu_entryhi(next->context.asid.counter);
+
+	flush_icache_deferred(next);
 }
 #endif /* __ASM_CSKY_MMU_CONTEXT_H */
diff --git a/arch/csky/mm/syscache.c b/arch/csky/mm/syscache.c
index c4645e4e97f4..ffade2f9a4c8 100644
--- a/arch/csky/mm/syscache.c
+++ b/arch/csky/mm/syscache.c
@@ -3,7 +3,7 @@
 
 #include <linux/syscalls.h>
 #include <asm/page.h>
-#include <asm/cache.h>
+#include <asm/cacheflush.h>
 #include <asm/cachectl.h>
 
 SYSCALL_DEFINE3(cacheflush,
@@ -13,17 +13,14 @@ SYSCALL_DEFINE3(cacheflush,
 {
 	switch (cache) {
 	case ICACHE:
-		icache_inv_range((unsigned long)addr,
-				 (unsigned long)addr + bytes);
-		break;
+	case BCACHE:
+		flush_icache_mm_range(current->mm,
+				(unsigned long)addr,
+				(unsigned long)addr + bytes);
 	case DCACHE:
 		dcache_wb_range((unsigned long)addr,
 				(unsigned long)addr + bytes);
 		break;
-	case BCACHE:
-		cache_wbinv_range((unsigned long)addr,
-				  (unsigned long)addr + bytes);
-		break;
 	default:
 		return -EINVAL;
 	}

From 359ae00d12589c31cf103894d0f32588d523ca83 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Sun, 2 Feb 2020 09:58:42 +0800
Subject: [PATCH 122/239] csky: Fixup ftrace modify panic

During ftrace init, linux will replace all function prologues
(call_mcout) with nops, but it need flush_dcache and
invalidate_icache to make it work. So flush_cache functions
couldn't be nested called by ftrace framework.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/mm/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/csky/mm/Makefile b/arch/csky/mm/Makefile
index 2c51918eb4fc..6e7696e55f71 100644
--- a/arch/csky/mm/Makefile
+++ b/arch/csky/mm/Makefile
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0-only
 ifeq ($(CONFIG_CPU_HAS_CACHEV2),y)
 obj-y +=			cachev2.o
+CFLAGS_REMOVE_cachev2.o = $(CC_FLAGS_FTRACE)
 else
 obj-y +=			cachev1.o
+CFLAGS_REMOVE_cachev1.o = $(CC_FLAGS_FTRACE)
 endif
 
 obj-y +=			dma-mapping.o

From 9025fd48a8aeddade845afa353d4bbab7f19dbf2 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Sun, 2 Feb 2020 10:58:38 +0800
Subject: [PATCH 123/239] csky: Remove unused cache implementation

Only for coding convention, these codes are unnecessary for abiv2.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/mm/cachev2.c | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/arch/csky/mm/cachev2.c b/arch/csky/mm/cachev2.c
index 909fdd0f5995..bc419f8039d3 100644
--- a/arch/csky/mm/cachev2.c
+++ b/arch/csky/mm/cachev2.c
@@ -52,23 +52,9 @@ void dcache_wb_range(unsigned long start, unsigned long end)
 	sync_is();
 }
 
-void dcache_inv_range(unsigned long start, unsigned long end)
-{
-	unsigned long i = start & ~(L1_CACHE_BYTES - 1);
-
-	for (; i < end; i += L1_CACHE_BYTES)
-		asm volatile("dcache.civa %0\n"::"r"(i):"memory");
-	sync_is();
-}
-
 void cache_wbinv_range(unsigned long start, unsigned long end)
 {
-	unsigned long i = start & ~(L1_CACHE_BYTES - 1);
-
-	for (; i < end; i += L1_CACHE_BYTES)
-		asm volatile("dcache.cval1 %0\n"::"r"(i):"memory");
-	sync_is();
-
+	dcache_wb_range(start, end);
 	icache_inv_range(start, end);
 }
 EXPORT_SYMBOL(cache_wbinv_range);

From 2305f60b76110cb3e8658a4ae85d1f7eb0c66a5b Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Sun, 2 Feb 2020 14:11:11 +0800
Subject: [PATCH 124/239] csky: Fixup compile warning for three unimplemented
 syscalls

Implement fstat64, fstatat64, clone3 syscalls to fixup
checksyscalls.sh compile warnings.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/include/uapi/asm/unistd.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/csky/include/uapi/asm/unistd.h b/arch/csky/include/uapi/asm/unistd.h
index 211c983c7282..ba4018929733 100644
--- a/arch/csky/include/uapi/asm/unistd.h
+++ b/arch/csky/include/uapi/asm/unistd.h
@@ -1,7 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
 
+#define __ARCH_WANT_STAT64
+#define __ARCH_WANT_NEW_STAT
 #define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_CLONE3
 #define __ARCH_WANT_SET_GET_RLIMIT
 #define __ARCH_WANT_TIME32_SYSCALLS
 #include <asm-generic/unistd.h>

From bebd26ab623616728d6e72b5c74a47bfff5287d8 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 31 Jan 2020 17:52:30 -0800
Subject: [PATCH 125/239] arch/csky: fix some Kconfig typos

Fix wording in help text for the CPU_HAS_LDSTEX symbol.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 111474b50f69..f1e73543feec 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -76,7 +76,7 @@ config CPU_HAS_TLBI
 config CPU_HAS_LDSTEX
 	bool
 	help
-	  For SMP, CPU needs "ldex&stex" instrcutions to atomic operations.
+	  For SMP, CPU needs "ldex&stex" instructions for atomic operations.
 
 config CPU_NEED_TLBSYNC
 	bool

From 4ec575b78521b4573e85aec451930cb040582455 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Thu, 30 Jan 2020 20:22:40 +0100
Subject: [PATCH 126/239] csky: Cleanup old Kconfig options

CONFIG_CLKSRC_OF is gone since commit bb0eb050a577
("clocksource/drivers: Rename CLKSRC_OF to TIMER_OF").  The platform
already selects TIMER_OF.

CONFIG_HAVE_DMA_API_DEBUG is gone since commit 6e88628d03dd ("dma-debug:
remove CONFIG_HAVE_DMA_API_DEBUG").

CONFIG_DEFAULT_DEADLINE is gone since commit f382fb0bcef4 ("block:
remove legacy IO schedulers").

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig           | 2 --
 arch/csky/configs/defconfig | 1 -
 2 files changed, 3 deletions(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index f1e73543feec..bf246b036dee 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -9,7 +9,6 @@ config CSKY
 	select ARCH_USE_QUEUED_RWLOCKS if NR_CPUS>2
 	select COMMON_CLK
 	select CLKSRC_MMIO
-	select CLKSRC_OF
 	select CSKY_MPINTC if CPU_CK860
 	select CSKY_MP_TIMER if CPU_CK860
 	select CSKY_APB_INTC
@@ -47,7 +46,6 @@ config CSKY
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
-	select HAVE_DMA_API_DEBUG
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/csky/configs/defconfig b/arch/csky/configs/defconfig
index 7ef42895dfb0..3b540539dbe6 100644
--- a/arch/csky/configs/defconfig
+++ b/arch/csky/configs/defconfig
@@ -10,7 +10,6 @@ CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_DEFAULT_DEADLINE=y
 CONFIG_CPU_CK807=y
 CONFIG_CPU_HAS_FPU=y
 CONFIG_NET=y

From d46869aaab7981f9153d3271228005a52ef18591 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Tue, 8 Oct 2019 14:25:13 +0800
Subject: [PATCH 127/239] csky: Add setup_initrd check code

We should give some necessary check for initrd just like other
architectures and it seems that setup_initrd() could be a common
code for all architectures.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/kernel/setup.c |  3 ---
 arch/csky/mm/init.c      | 44 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c
index 4fc7b16b77d2..3821e55742f4 100644
--- a/arch/csky/kernel/setup.c
+++ b/arch/csky/kernel/setup.c
@@ -47,9 +47,6 @@ static void __init csky_memblock_init(void)
 	signed long size;
 
 	memblock_reserve(__pa(_stext), _end - _stext);
-#ifdef CONFIG_BLK_DEV_INITRD
-	memblock_reserve(__pa(initrd_start), initrd_end - initrd_start);
-#endif
 
 	early_init_fdt_reserve_self();
 	early_init_fdt_scan_reserved_mem();
diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c
index dbb4b2dfe4b7..cb64d8647a78 100644
--- a/arch/csky/mm/init.c
+++ b/arch/csky/mm/init.c
@@ -19,6 +19,7 @@
 #include <linux/swap.h>
 #include <linux/proc_fs.h>
 #include <linux/pfn.h>
+#include <linux/initrd.h>
 
 #include <asm/setup.h>
 #include <asm/cachectl.h>
@@ -36,6 +37,45 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
 						__page_aligned_bss;
 EXPORT_SYMBOL(empty_zero_page);
 
+#ifdef CONFIG_BLK_DEV_INITRD
+static void __init setup_initrd(void)
+{
+	unsigned long size;
+
+	if (initrd_start >= initrd_end) {
+		pr_err("initrd not found or empty");
+		goto disable;
+	}
+
+	if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) {
+		pr_err("initrd extends beyond end of memory");
+		goto disable;
+	}
+
+	size = initrd_end - initrd_start;
+
+	if (memblock_is_region_reserved(__pa(initrd_start), size)) {
+		pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region",
+		       __pa(initrd_start), size);
+		goto disable;
+	}
+
+	memblock_reserve(__pa(initrd_start), size);
+
+	pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n",
+		(void *)(initrd_start), size);
+
+	initrd_below_start_ok = 1;
+
+	return;
+
+disable:
+	initrd_start = initrd_end = 0;
+
+	pr_err(" - disabling initrd\n");
+}
+#endif
+
 void __init mem_init(void)
 {
 #ifdef CONFIG_HIGHMEM
@@ -47,6 +87,10 @@ void __init mem_init(void)
 #endif
 	high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
 
+#ifdef CONFIG_BLK_DEV_INITRD
+	setup_initrd();
+#endif
+
 	memblock_free_all();
 
 #ifdef CONFIG_HIGHMEM

From dc2efc0028dd5c4bf0f7324806c9a196958aaad3 Mon Sep 17 00:00:00 2001
From: Ma Jun <majun258@linux.alibaba.com>
Date: Sun, 2 Feb 2020 17:56:58 +0800
Subject: [PATCH 128/239] csky: Minimize defconfig to support buildroot
 config.fragment

Some bsp (eg: buildroot) has defconfig.fragment design to add more
configs into the defconfig in linux source code tree. For example,
we could put different cpu configs into different defconfig.fragments,
but they all use the same defconfig in Linux.

Signed-off-by: Ma Jun <majun258@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/configs/defconfig | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/csky/configs/defconfig b/arch/csky/configs/defconfig
index 3b540539dbe6..af722e4dfb47 100644
--- a/arch/csky/configs/defconfig
+++ b/arch/csky/configs/defconfig
@@ -10,8 +10,6 @@ CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
-CONFIG_CPU_CK807=y
-CONFIG_CPU_HAS_FPU=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
@@ -26,10 +24,7 @@ CONFIG_SERIAL_NONSTANDARD=y
 CONFIG_SERIAL_8250=y
 CONFIG_SERIAL_8250_CONSOLE=y
 CONFIG_SERIAL_OF_PLATFORM=y
-CONFIG_TTY_PRINTK=y
 # CONFIG_VGA_CONSOLE is not set
-CONFIG_CSKY_MPTIMER=y
-CONFIG_GX6605S_TIMER=y
 CONFIG_PM_DEVFREQ=y
 CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y
 CONFIG_DEVFREQ_GOV_PERFORMANCE=y
@@ -55,6 +50,4 @@ CONFIG_CRAMFS=y
 CONFIG_ROMFS_FS=y
 CONFIG_NFS_FS=y
 CONFIG_PRINTK_TIME=y
-CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_FS=y
 CONFIG_MAGIC_SYSRQ=y

From 5b49c82dadfe0f3741778f57385f964ec1514863 Mon Sep 17 00:00:00 2001
From: MaJun <majun258@linux.alibaba.com>
Date: Mon, 27 Jan 2020 10:56:21 +0800
Subject: [PATCH 129/239] csky: Add PCI support

Add the pci related code for csky arch to support basic pci virtual
function, such as qemu virt-pci-9pfs.

Signed-off-by: MaJun <majun258@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig            |  5 +++++
 arch/csky/include/asm/Kbuild |  1 -
 arch/csky/include/asm/pci.h  | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 arch/csky/include/asm/pci.h

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index bf246b036dee..72b2999a889a 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -58,6 +58,11 @@ config CSKY
 	select TIMER_OF
 	select USB_ARCH_HAS_EHCI
 	select USB_ARCH_HAS_OHCI
+	select GENERIC_PCI_IOMAP
+	select HAVE_PCI
+	select PCI_DOMAINS_GENERIC if PCI
+	select PCI_SYSCALL if PCI
+	select PCI_MSI if PCI
 
 config CPU_HAS_CACHEV2
 	bool
diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index bc15a26c782f..4130e3eaa766 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -28,7 +28,6 @@ generic-y += local64.h
 generic-y += mm-arch-hooks.h
 generic-y += mmiowb.h
 generic-y += module.h
-generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += qrwlock.h
diff --git a/arch/csky/include/asm/pci.h b/arch/csky/include/asm/pci.h
new file mode 100644
index 000000000000..ebc765b1f78b
--- /dev/null
+++ b/arch/csky/include/asm/pci.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_CSKY_PCI_H
+#define __ASM_CSKY_PCI_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/io.h>
+
+#define PCIBIOS_MIN_IO		0
+#define PCIBIOS_MIN_MEM		0
+
+/* C-SKY shim does not initialize PCI bus */
+#define pcibios_assign_all_busses() 1
+
+extern int isa_dma_bridge_buggy;
+
+#ifdef CONFIG_PCI
+static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
+{
+	/* no legacy IRQ on csky */
+	return -ENODEV;
+}
+
+static inline int pci_proc_domain(struct pci_bus *bus)
+{
+	/* always show the domain in /proc */
+	return 1;
+}
+#endif  /* CONFIG_PCI */
+
+#endif  /* __ASM_CSKY_PCI_H */

From 0b9f386c4be6493d282aab0af6f9b70c62142777 Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Wed, 12 Feb 2020 10:24:52 +0800
Subject: [PATCH 130/239] csky: Implement copy_thread_tls

This is required for clone3 which passes the TLS value through a
struct rather than a register.

Cc: Amanieu d'Antras <amanieu@gmail.com>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/Kconfig          | 1 +
 arch/csky/kernel/process.c | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 72b2999a889a..047427f71d83 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -36,6 +36,7 @@ config CSKY
 	select GX6605S_TIMER if CPU_CK610
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_COPY_THREAD_TLS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c
index 5349cd8c0f30..f7b231ca269a 100644
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -40,10 +40,11 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 	return sw->r15;
 }
 
-int copy_thread(unsigned long clone_flags,
+int copy_thread_tls(unsigned long clone_flags,
 		unsigned long usp,
 		unsigned long kthread_arg,
-		struct task_struct *p)
+		struct task_struct *p,
+		unsigned long tls)
 {
 	struct switch_stack *childstack;
 	struct pt_regs *childregs = task_pt_regs(p);
@@ -70,7 +71,7 @@ int copy_thread(unsigned long clone_flags,
 			childregs->usp = usp;
 		if (clone_flags & CLONE_SETTLS)
 			task_thread_info(p)->tp_value = childregs->tls
-						      = childregs->regs[0];
+						      = tls;
 
 		childregs->a0 = 0;
 		childstack->r15 = (unsigned long) ret_from_fork;

From 4c5fd3b791a06021084b42d5610400f846d206b5 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 19 Feb 2020 17:28:21 -0800
Subject: [PATCH 131/239] zonefs: fix documentation typos etc.

Fix typos, spellos, etc. in zonefs.txt.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Damien Le Moal <Damien.LeMoal@wdc.com>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 Documentation/filesystems/zonefs.txt | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/Documentation/filesystems/zonefs.txt b/Documentation/filesystems/zonefs.txt
index 935bf22031ca..d54fa98ac158 100644
--- a/Documentation/filesystems/zonefs.txt
+++ b/Documentation/filesystems/zonefs.txt
@@ -134,7 +134,7 @@ Sequential zone files can only be written sequentially, starting from the file
 end, that is, write operations can only be append writes. Zonefs makes no
 attempt at accepting random writes and will fail any write request that has a
 start offset not corresponding to the end of the file, or to the end of the last
-write issued and still in-flight (for asynchrnous I/O operations).
+write issued and still in-flight (for asynchronous I/O operations).
 
 Since dirty page writeback by the page cache does not guarantee a sequential
 write pattern, zonefs prevents buffered writes and writeable shared mappings
@@ -142,7 +142,7 @@ on sequential files. Only direct I/O writes are accepted for these files.
 zonefs relies on the sequential delivery of write I/O requests to the device
 implemented by the block layer elevator. An elevator implementing the sequential
 write feature for zoned block device (ELEVATOR_F_ZBD_SEQ_WRITE elevator feature)
-must be used. This type of elevator (e.g. mq-deadline) is the set by default
+must be used. This type of elevator (e.g. mq-deadline) is set by default
 for zoned block devices on device initialization.
 
 There are no restrictions on the type of I/O used for read operations in
@@ -196,7 +196,7 @@ additional conditions that result in I/O errors.
   may still happen in the case of a partial failure of a very large direct I/O
   operation split into multiple BIOs/requests or asynchronous I/O operations.
   If one of the write request within the set of sequential write requests
-  issued to the device fails, all write requests after queued after it will
+  issued to the device fails, all write requests queued after it will
   become unaligned and fail.
 
 * Delayed write errors: similarly to regular block devices, if the device side
@@ -207,7 +207,7 @@ additional conditions that result in I/O errors.
   causing all data to be dropped after the sector that caused the error.
 
 All I/O errors detected by zonefs are notified to the user with an error code
-return for the system call that trigered or detected the error. The recovery
+return for the system call that triggered or detected the error. The recovery
 actions taken by zonefs in response to I/O errors depend on the I/O type (read
 vs write) and on the reason for the error (bad sector, unaligned writes or zone
 condition change).
@@ -222,7 +222,7 @@ condition change).
 * A zone condition change to read-only or offline also always triggers zonefs
   I/O error recovery.
 
-Zonefs minimal I/O error recovery may change a file size and a file access
+Zonefs minimal I/O error recovery may change a file size and file access
 permissions.
 
 * File size changes:
@@ -237,7 +237,7 @@ permissions.
   A file size may also be reduced to reflect a delayed write error detected on
   fsync(): in this case, the amount of data effectively written in the zone may
   be less than originally indicated by the file inode size. After such I/O
-  error, zonefs always fixes a file inode size to reflect the amount of data
+  error, zonefs always fixes the file inode size to reflect the amount of data
   persistently stored in the file zone.
 
 * Access permission changes:
@@ -281,11 +281,11 @@ Further notes:
   permissions to read-only applies to all files. The file system is remounted
   read-only.
 * Access permission and file size changes due to the device transitioning zones
-  to the offline condition are permanent. Remounting or reformating the device
+  to the offline condition are permanent. Remounting or reformatting the device
   with mkfs.zonefs (mkzonefs) will not change back offline zone files to a good
   state.
 * File access permission changes to read-only due to the device transitioning
-  zones to the read-only condition are permanent. Remounting or reformating
+  zones to the read-only condition are permanent. Remounting or reformatting
   the device will not re-enable file write access.
 * File access permission changes implied by the remount-ro, zone-ro and
   zone-offline mount options are temporary for zones in a good condition.
@@ -301,13 +301,13 @@ Mount options
 
 zonefs define the "errors=<behavior>" mount option to allow the user to specify
 zonefs behavior in response to I/O errors, inode size inconsistencies or zone
-condition chages. The defined behaviors are as follow:
+condition changes. The defined behaviors are as follow:
 * remount-ro (default)
 * zone-ro
 * zone-offline
 * repair
 
-The I/O error actions defined for each behavior is detailed in the previous
+The I/O error actions defined for each behavior are detailed in the previous
 section.
 
 Zonefs User Space Tools

From 9951ebfcdf2b97dbb28a5d930458424341e61aa2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 21 Feb 2020 10:41:43 +0100
Subject: [PATCH 132/239] nl80211: fix potential leak in AP start

If nl80211_parse_he_obss_pd() fails, we leak the previously
allocated ACL memory. Free it in this case.

Fixes: 796e90f42b7e ("cfg80211: add support for parsing OBBS_PD attributes")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Link: https://lore.kernel.org/r/20200221104142.835aba4cdd14.I1923b55ba9989c57e13978f91f40bfdc45e60cbd@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index cedf17d4933f..46be40e19e7f 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4800,8 +4800,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		err = nl80211_parse_he_obss_pd(
 					info->attrs[NL80211_ATTR_HE_OBSS_PD],
 					&params.he_obss_pd);
-		if (err)
-			return err;
+		goto out;
 	}
 
 	nl80211_calculate_ap_params(&params);
@@ -4823,6 +4822,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	}
 	wdev_unlock(wdev);
 
+out:
 	kfree(params.acl);
 
 	return err;

From a7ee7d44b57c9ae174088e53a668852b7f4f452d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 21 Feb 2020 10:44:50 +0100
Subject: [PATCH 133/239] cfg80211: check reg_rule for NULL in
 handle_channel_custom()

We may end up with a NULL reg_rule after the loop in
handle_channel_custom() if the bandwidth didn't fit,
check if this is the case and bail out if so.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Link: https://lore.kernel.org/r/20200221104449.3b558a50201c.I4ad3725c4dacaefd2d18d3cc65ba6d18acd5dbfe@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index fff9a74891fc..1a8218f1bbe0 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2276,7 +2276,7 @@ static void handle_channel_custom(struct wiphy *wiphy,
 			break;
 	}
 
-	if (IS_ERR(reg_rule)) {
+	if (IS_ERR_OR_NULL(reg_rule)) {
 		pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n",
 			 chan->center_freq);
 		if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {

From 0daa63ed4c6c4302790ce67b7a90c0997ceb7514 Mon Sep 17 00:00:00 2001
From: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Date: Fri, 21 Feb 2020 10:47:20 +0100
Subject: [PATCH 134/239] mac80211: Remove a redundant mutex unlock

The below-mentioned commit changed the code to unlock *inside*
the function, but previously the unlock was *outside*. It failed
to remove the outer unlock, however, leading to double unlock.

Fix this.

Fixes: 33483a6b88e4 ("mac80211: fix missing unlock on error in ieee80211_mark_sta_auth()")
Signed-off-by: Andrei Otcheretianski <andrei.otcheretianski@intel.com>
Link: https://lore.kernel.org/r/20200221104719.cce4741cf6eb.I671567b185c8a4c2409377e483fd149ce590f56d@changeid
[rewrite commit message to better explain what happened]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index e041af2f021a..88d7a692a965 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2959,7 +2959,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 	    (auth_transaction == 2 &&
 	     ifmgd->auth_data->expected_transaction == 2)) {
 		if (!ieee80211_mark_sta_auth(sdata, bssid))
-			goto out_err;
+			return; /* ignore frame -- wait for timeout */
 	} else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE &&
 		   auth_transaction == 2) {
 		sdata_info(sdata, "SAE peer confirmed\n");
@@ -2967,10 +2967,6 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
 	}
 
 	cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len);
-	return;
- out_err:
-	mutex_unlock(&sdata->local->sta_mtx);
-	/* ignore frame -- wait for timeout */
 }
 
 #define case_WLAN(type) \

From 2546287c5fb363a0165933ae2181c92f03e701d0 Mon Sep 17 00:00:00 2001
From: Zenghui Yu <yuzenghui@huawei.com>
Date: Fri, 21 Feb 2020 10:07:25 +0800
Subject: [PATCH 135/239] genirq/irqdomain: Make sure all irq domain flags are
 distinct

This was noticed when printing debugfs for MSIs on my ARM64 server.  The
new dstate IRQD_MSI_NOMASK_QUIRK came out surprisingly while it should only
be the x86 stuff for the time being...

The new MSI quirk flag uses the same bit as IRQ_DOMAIN_NAME_ALLOCATED which
is oddly defined as bit 6 for no good reason.

Switch it to the non used bit 1.

Fixes: 6f1a4891a592 ("x86/apic/msi: Plug non-maskable MSI affinity race")
Signed-off-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20200221020725.2038-1-yuzenghui@huawei.com
---
 include/linux/irqdomain.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index b2d47571ab67..8d062e86d954 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -192,7 +192,7 @@ enum {
 	IRQ_DOMAIN_FLAG_HIERARCHY	= (1 << 0),
 
 	/* Irq domain name was allocated in __irq_domain_add() */
-	IRQ_DOMAIN_NAME_ALLOCATED	= (1 << 6),
+	IRQ_DOMAIN_NAME_ALLOCATED	= (1 << 1),
 
 	/* Irq domain is an IPI domain with virq per cpu */
 	IRQ_DOMAIN_FLAG_IPI_PER_CPU	= (1 << 2),

From 3b7830904e17202524bad1974505a9bfc718d31f Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe <logang@deltatee.com>
Date: Thu, 20 Feb 2020 13:29:53 -0700
Subject: [PATCH 136/239] nvme-multipath: Fix memory leak with ana_log_buf

kmemleak reports a memory leak with the ana_log_buf allocated by
nvme_mpath_init():

unreferenced object 0xffff888120e94000 (size 8208):
  comm "nvme", pid 6884, jiffies 4295020435 (age 78786.312s)
    hex dump (first 32 bytes):
      00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00  ................
      01 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00  ................
    backtrace:
      [<00000000e2360188>] kmalloc_order+0x97/0xc0
      [<0000000079b18dd4>] kmalloc_order_trace+0x24/0x100
      [<00000000f50c0406>] __kmalloc+0x24c/0x2d0
      [<00000000f31a10b9>] nvme_mpath_init+0x23c/0x2b0
      [<000000005802589e>] nvme_init_identify+0x75f/0x1600
      [<0000000058ef911b>] nvme_loop_configure_admin_queue+0x26d/0x280
      [<00000000673774b9>] nvme_loop_create_ctrl+0x2a7/0x710
      [<00000000f1c7a233>] nvmf_dev_write+0xc66/0x10b9
      [<000000004199f8d0>] __vfs_write+0x50/0xa0
      [<0000000065466fef>] vfs_write+0xf3/0x280
      [<00000000b0db9a8b>] ksys_write+0xc6/0x160
      [<0000000082156b91>] __x64_sys_write+0x43/0x50
      [<00000000c34fbb6d>] do_syscall_64+0x77/0x2f0
      [<00000000bbc574c9>] entry_SYSCALL_64_after_hwframe+0x49/0xbe

nvme_mpath_init() is called by nvme_init_identify() which is called in
multiple places (nvme_reset_work(), nvme_passthru_end(), etc). This
means nvme_mpath_init() may be called multiple times before
nvme_mpath_uninit() (which is only called on nvme_free_ctrl()).

When nvme_mpath_init() is called multiple times, it overwrites the
ana_log_buf pointer with a new allocation, thus leaking the previous
allocation.

To fix this, free ana_log_buf before allocating a new one.

Fixes: 0d0b660f214dc490 ("nvme: add ANA support")
Cc: <stable@vger.kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/multipath.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 797c18337d96..a11900cf3a36 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -715,6 +715,7 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	}
 
 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+	kfree(ctrl->ana_log_buf);
 	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
 	if (!ctrl->ana_log_buf) {
 		error = -ENOMEM;

From 4e4694d8729f7cd6381f6691e8f83e378fce3160 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 21 Feb 2020 17:13:42 +0900
Subject: [PATCH 137/239] bootconfig: Prohibit re-defining value on same key

Currently, bootconfig adds a new value on the existing key to the tail of an
array. But this looks a bit confusing because an admin can easily rewrite
the original value in the same config file.

This rejects the following value re-definition.

  key = value1
  ...
  key = value2

You should rewrite value1 to value2 in this case.

Link: http://lkml.kernel.org/r/158227282199.12842.10110929876059658601.stgit@devnote2

Suggested-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
[ Fixed spelling of arraies to arrays ]
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/admin-guide/bootconfig.rst   | 11 ++++++++++-
 lib/bootconfig.c                           | 13 ++++++++-----
 tools/bootconfig/samples/bad-samekey.bconf |  6 ++++++
 3 files changed, 24 insertions(+), 6 deletions(-)
 create mode 100644 tools/bootconfig/samples/bad-samekey.bconf

diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index dfeffa73dca3..57119fb69d36 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -62,7 +62,16 @@ Or more shorter, written as following::
 In both styles, same key words are automatically merged when parsing it
 at boot time. So you can append similar trees or key-values.
 
-Note that a sub-key and a value can not co-exist under a parent key.
+Same-key Values
+---------------
+
+It is prohibited that two or more values or arrays share a same-key.
+For example,::
+
+ foo = bar, baz
+ foo = qux  # !ERROR! we can not re-define same key
+
+Also, a sub-key and a value can not co-exist under a parent key.
 For example, following config is NOT allowed.::
 
  foo = value1
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 54ac623ca781..2ef304db31f2 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -581,7 +581,7 @@ static int __init __xbc_parse_keys(char *k)
 static int __init xbc_parse_kv(char **k, char *v)
 {
 	struct xbc_node *prev_parent = last_parent;
-	struct xbc_node *node, *child;
+	struct xbc_node *child;
 	char *next;
 	int c, ret;
 
@@ -590,15 +590,18 @@ static int __init xbc_parse_kv(char **k, char *v)
 		return ret;
 
 	child = xbc_node_get_child(last_parent);
-	if (child && xbc_node_is_key(child))
-		return xbc_parse_error("Value is mixed with subkey", v);
+	if (child) {
+		if (xbc_node_is_key(child))
+			return xbc_parse_error("Value is mixed with subkey", v);
+		else
+			return xbc_parse_error("Value is redefined", v);
+	}
 
 	c = __xbc_parse_value(&v, &next);
 	if (c < 0)
 		return c;
 
-	node = xbc_add_sibling(v, XBC_VALUE);
-	if (!node)
+	if (!xbc_add_sibling(v, XBC_VALUE))
 		return -ENOMEM;
 
 	if (c == ',') {	/* Array */
diff --git a/tools/bootconfig/samples/bad-samekey.bconf b/tools/bootconfig/samples/bad-samekey.bconf
new file mode 100644
index 000000000000..e8d983a4563c
--- /dev/null
+++ b/tools/bootconfig/samples/bad-samekey.bconf
@@ -0,0 +1,6 @@
+# Same key value is not allowed
+key {
+	foo = value
+	bar = value2
+}
+key.foo = value

From 5f811c57c99205e048926293bb812c750a6ea562 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 21 Feb 2020 17:13:52 +0900
Subject: [PATCH 138/239] bootconfig: Add append value operator support

Add append value operator "+=" support to bootconfig syntax.
With this operator, user can add new value to the key as
an entry of array instead of overwriting.
For example,

  foo = bar
  ...
  foo += baz

Then the key "foo" has "bar" and "baz" values as an array.

Link: http://lkml.kernel.org/r/158227283195.12842.8310503105963275584.stgit@devnote2

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 Documentation/admin-guide/bootconfig.rst | 10 +++++++++-
 lib/bootconfig.c                         | 15 +++++++++++----
 tools/bootconfig/test-bootconfig.sh      | 16 ++++++++++++++--
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index 57119fb69d36..cf2edcd09183 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -71,7 +71,15 @@ For example,::
  foo = bar, baz
  foo = qux  # !ERROR! we can not re-define same key
 
-Also, a sub-key and a value can not co-exist under a parent key.
+If you want to append the value to existing key as an array member,
+you can use ``+=`` operator. For example::
+
+ foo = bar, baz
+ foo += qux
+
+In this case, the key ``foo`` has ``bar``, ``baz`` and ``qux``.
+
+However, a sub-key and a value can not co-exist under a parent key.
 For example, following config is NOT allowed.::
 
  foo = value1
diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 2ef304db31f2..ec3ce7fd299f 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -578,7 +578,7 @@ static int __init __xbc_parse_keys(char *k)
 	return __xbc_add_key(k);
 }
 
-static int __init xbc_parse_kv(char **k, char *v)
+static int __init xbc_parse_kv(char **k, char *v, int op)
 {
 	struct xbc_node *prev_parent = last_parent;
 	struct xbc_node *child;
@@ -593,7 +593,7 @@ static int __init xbc_parse_kv(char **k, char *v)
 	if (child) {
 		if (xbc_node_is_key(child))
 			return xbc_parse_error("Value is mixed with subkey", v);
-		else
+		else if (op == '=')
 			return xbc_parse_error("Value is redefined", v);
 	}
 
@@ -774,7 +774,7 @@ int __init xbc_init(char *buf)
 
 	p = buf;
 	do {
-		q = strpbrk(p, "{}=;\n#");
+		q = strpbrk(p, "{}=+;\n#");
 		if (!q) {
 			p = skip_spaces(p);
 			if (*p != '\0')
@@ -785,8 +785,15 @@ int __init xbc_init(char *buf)
 		c = *q;
 		*q++ = '\0';
 		switch (c) {
+		case '+':
+			if (*q++ != '=') {
+				ret = xbc_parse_error("Wrong '+' operator",
+							q - 2);
+				break;
+			}
+			/* Fall through */
 		case '=':
-			ret = xbc_parse_kv(&p, q);
+			ret = xbc_parse_kv(&p, q, c);
 			break;
 		case '{':
 			ret = xbc_open_brace(&p, q);
diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh
index adafb7c50940..1411f4c3454f 100755
--- a/tools/bootconfig/test-bootconfig.sh
+++ b/tools/bootconfig/test-bootconfig.sh
@@ -9,7 +9,7 @@ TEMPCONF=`mktemp temp-XXXX.bconf`
 NG=0
 
 cleanup() {
-  rm -f $INITRD $TEMPCONF
+  rm -f $INITRD $TEMPCONF $OUTFILE
   exit $NG
 }
 
@@ -71,7 +71,6 @@ printf " \0\0\0 \0\0\0" >> $INITRD
 $BOOTCONF -a $TEMPCONF $INITRD > $OUTFILE 2>&1
 xfail grep -i "failed" $OUTFILE
 xfail grep -i "error" $OUTFILE
-rm $OUTFILE
 
 echo "Max node number check"
 
@@ -96,6 +95,19 @@ truncate -s 32764 $TEMPCONF
 echo "\"" >> $TEMPCONF	# add 2 bytes + terminal ('\"\n\0')
 xpass $BOOTCONF -a $TEMPCONF $INITRD
 
+echo "Adding same-key values"
+cat > $TEMPCONF << EOF
+key = bar, baz
+key += qux
+EOF
+echo > $INITRD
+
+xpass $BOOTCONF -a $TEMPCONF $INITRD
+$BOOTCONF $INITRD > $OUTFILE
+xpass grep -q "bar" $OUTFILE
+xpass grep -q "baz" $OUTFILE
+xpass grep -q "qux" $OUTFILE
+
 echo "=== expected failure cases ==="
 for i in samples/bad-* ; do
   xfail $BOOTCONF -a $i $INITRD

From a5ae50dea9111db63d30d700766dd5509602f7ad Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 20 Feb 2020 13:29:49 +0000
Subject: [PATCH 139/239] Btrfs: fix deadlock during fast fsync when logging
 prealloc extents beyond eof

While logging the prealloc extents of an inode during a fast fsync we call
btrfs_truncate_inode_items(), through btrfs_log_prealloc_extents(), while
holding a read lock on a leaf of the inode's root (not the log root, the
fs/subvol root), and then that function locks the file range in the inode's
iotree. This can lead to a deadlock when:

* the fsync is ranged

* the file has prealloc extents beyond eof

* writeback for a range different from the fsync range starts
  during the fsync

* the size of the file is not sector size aligned

Because when finishing an ordered extent we lock first a file range and
then try to COW the fs/subvol tree to insert an extent item.

The following diagram shows how the deadlock can happen.

           CPU 1                                        CPU 2

  btrfs_sync_file()
    --> for range [0, 1MiB)

    --> inode has a size of
        1MiB and has 1 prealloc
        extent beyond the
        i_size, starting at offset
        4MiB

    flushes all delalloc for the
    range [0MiB, 1MiB) and waits
    for the respective ordered
    extents to complete

                                              --> before task at CPU 1 locks the
                                                  inode, a write into file range
                                                  [1MiB, 2MiB + 1KiB) is made

                                              --> i_size is updated to 2MiB + 1KiB

                                              --> writeback is started for that
                                                  range, [1MiB, 2MiB + 4KiB)
                                                  --> end offset rounded up to
                                                      be sector size aligned

    btrfs_log_dentry_safe()
      btrfs_log_inode_parent()
        btrfs_log_inode()

          btrfs_log_changed_extents()
            btrfs_log_prealloc_extents()
              --> does a search on the
                  inode's root
              --> holds a read lock on
                  leaf X

                                              btrfs_finish_ordered_io()
                                                --> locks range [1MiB, 2MiB + 4KiB)
                                                    --> end offset rounded up
                                                        to be sector size aligned

                                                --> tries to cow leaf X, through
                                                    insert_reserved_file_extent()
                                                    --> already locked by the
                                                        task at CPU 1

              btrfs_truncate_inode_items()

                --> gets an i_size of
                    2MiB + 1KiB, which is
                    not sector size
                    aligned

                --> tries to lock file
                    range [2MiB, (u64)-1)
                    --> the start range
                        is rounded down
                        from 2MiB + 1K
                        to 2MiB to be sector
                        size aligned

                    --> but the subrange
                        [2MiB, 2MiB + 4KiB) is
                        already locked by
                        task at CPU 2 which
                        is waiting to get a
                        write lock on leaf X
                        for which we are
                        holding a read lock

                                *** deadlock ***

This results in a stack trace like the following, triggered by test case
generic/561 from fstests:

  [ 2779.973608] INFO: task kworker/u8:6:247 blocked for more than 120 seconds.
  [ 2779.979536]       Not tainted 5.6.0-rc2-btrfs-next-53 #1
  [ 2779.984503] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 2779.990136] kworker/u8:6    D    0   247      2 0x80004000
  [ 2779.990457] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
  [ 2779.990466] Call Trace:
  [ 2779.990491]  ? __schedule+0x384/0xa30
  [ 2779.990521]  schedule+0x33/0xe0
  [ 2779.990616]  btrfs_tree_read_lock+0x19e/0x2e0 [btrfs]
  [ 2779.990632]  ? remove_wait_queue+0x60/0x60
  [ 2779.990730]  btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
  [ 2779.990782]  btrfs_search_slot+0x510/0x1000 [btrfs]
  [ 2779.990869]  btrfs_lookup_file_extent+0x4a/0x70 [btrfs]
  [ 2779.990944]  __btrfs_drop_extents+0x161/0x1060 [btrfs]
  [ 2779.990987]  ? mark_held_locks+0x6d/0xc0
  [ 2779.990994]  ? __slab_alloc.isra.49+0x99/0x100
  [ 2779.991060]  ? insert_reserved_file_extent.constprop.19+0x64/0x300 [btrfs]
  [ 2779.991145]  insert_reserved_file_extent.constprop.19+0x97/0x300 [btrfs]
  [ 2779.991222]  ? start_transaction+0xdd/0x5c0 [btrfs]
  [ 2779.991291]  btrfs_finish_ordered_io+0x4f4/0x840 [btrfs]
  [ 2779.991405]  btrfs_work_helper+0xaa/0x720 [btrfs]
  [ 2779.991432]  process_one_work+0x26d/0x6a0
  [ 2779.991460]  worker_thread+0x4f/0x3e0
  [ 2779.991481]  ? process_one_work+0x6a0/0x6a0
  [ 2779.991489]  kthread+0x103/0x140
  [ 2779.991499]  ? kthread_create_worker_on_cpu+0x70/0x70
  [ 2779.991515]  ret_from_fork+0x3a/0x50
  (...)
  [ 2780.026211] INFO: task fsstress:17375 blocked for more than 120 seconds.
  [ 2780.027480]       Not tainted 5.6.0-rc2-btrfs-next-53 #1
  [ 2780.028482] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 2780.030035] fsstress        D    0 17375  17373 0x00004000
  [ 2780.030038] Call Trace:
  [ 2780.030044]  ? __schedule+0x384/0xa30
  [ 2780.030052]  schedule+0x33/0xe0
  [ 2780.030075]  lock_extent_bits+0x20c/0x320 [btrfs]
  [ 2780.030094]  ? btrfs_truncate_inode_items+0xf4/0x1150 [btrfs]
  [ 2780.030098]  ? rcu_read_lock_sched_held+0x59/0xa0
  [ 2780.030102]  ? remove_wait_queue+0x60/0x60
  [ 2780.030122]  btrfs_truncate_inode_items+0x133/0x1150 [btrfs]
  [ 2780.030151]  ? btrfs_set_path_blocking+0xb2/0x160 [btrfs]
  [ 2780.030165]  ? btrfs_search_slot+0x379/0x1000 [btrfs]
  [ 2780.030195]  btrfs_log_changed_extents.isra.8+0x841/0x93e [btrfs]
  [ 2780.030202]  ? do_raw_spin_unlock+0x49/0xc0
  [ 2780.030215]  ? btrfs_get_num_csums+0x10/0x10 [btrfs]
  [ 2780.030239]  btrfs_log_inode+0xf83/0x1124 [btrfs]
  [ 2780.030251]  ? __mutex_unlock_slowpath+0x45/0x2a0
  [ 2780.030275]  btrfs_log_inode_parent+0x2a0/0xe40 [btrfs]
  [ 2780.030282]  ? dget_parent+0xa1/0x370
  [ 2780.030309]  btrfs_log_dentry_safe+0x4a/0x70 [btrfs]
  [ 2780.030329]  btrfs_sync_file+0x3f3/0x490 [btrfs]
  [ 2780.030339]  do_fsync+0x38/0x60
  [ 2780.030343]  __x64_sys_fdatasync+0x13/0x20
  [ 2780.030345]  do_syscall_64+0x5c/0x280
  [ 2780.030348]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [ 2780.030356] RIP: 0033:0x7f2d80f6d5f0
  [ 2780.030361] Code: Bad RIP value.
  [ 2780.030362] RSP: 002b:00007ffdba3c8548 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
  [ 2780.030364] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f2d80f6d5f0
  [ 2780.030365] RDX: 00007ffdba3c84b0 RSI: 00007ffdba3c84b0 RDI: 0000000000000003
  [ 2780.030367] RBP: 000000000000004a R08: 0000000000000001 R09: 00007ffdba3c855c
  [ 2780.030368] R10: 0000000000000078 R11: 0000000000000246 R12: 00000000000001f4
  [ 2780.030369] R13: 0000000051eb851f R14: 00007ffdba3c85f0 R15: 0000557a49220d90

So fix this by making btrfs_truncate_inode_items() not lock the range in
the inode's iotree when the target root is a log root, since it's not
needed to lock the range for log roots as the protection from the inode's
lock and log_mutex are all that's needed.

Fixes: 28553fa992cb28 ("Btrfs: fix race between shrinking truncate and fiemap")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/inode.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f47ba652b31..1ccb3f8d528d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4103,8 +4103,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	path->reada = READA_BACK;
 
-	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
-			 &cached_state);
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+				 &cached_state);
 
 	/*
 	 * We want to drop from the next block forward in case this new size is
@@ -4368,11 +4369,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		if (!ret && last_size > new_size)
 			last_size = new_size;
 		btrfs_ordered_update_i_size(inode, last_size, NULL);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+				     (u64)-1, &cached_state);
 	}
 
-	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
-			     &cached_state);
-
 	btrfs_free_path(path);
 	return ret;
 }

From 7143b5ac5750f404ff3a594b34fdf3fc2f99f828 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Fri, 21 Feb 2020 16:42:16 +0100
Subject: [PATCH 140/239] io_uring: prevent sq_thread from spinning when it
 should stop

This patch drops 'cur_mm' before calling cond_resched(), to prevent
the sq_thread from spinning even when the user process is finished.

Before this patch, if the user process ended without closing the
io_uring fd, the sq_thread continues to spin until the
'sq_thread_idle' timeout ends.

In the worst case where the 'sq_thread_idle' parameter is bigger than
INT_MAX, the sq_thread will spin forever.

Fixes: 6c271ce2f1d5 ("io_uring: add submission polling")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e249aa97ba3..b43467b3a8dc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5142,6 +5142,18 @@ static int io_sq_thread(void *data)
 		 * to enter the kernel to reap and flush events.
 		 */
 		if (!to_submit || ret == -EBUSY) {
+			/*
+			 * Drop cur_mm before scheduling, we can't hold it for
+			 * long periods (or over schedule()). Do this before
+			 * adding ourselves to the waitqueue, as the unuse/drop
+			 * may sleep.
+			 */
+			if (cur_mm) {
+				unuse_mm(cur_mm);
+				mmput(cur_mm);
+				cur_mm = NULL;
+			}
+
 			/*
 			 * We're polling. If we're within the defined idle
 			 * period, then let us spin without work before going
@@ -5156,18 +5168,6 @@ static int io_sq_thread(void *data)
 				continue;
 			}
 
-			/*
-			 * Drop cur_mm before scheduling, we can't hold it for
-			 * long periods (or over schedule()). Do this before
-			 * adding ourselves to the waitqueue, as the unuse/drop
-			 * may sleep.
-			 */
-			if (cur_mm) {
-				unuse_mm(cur_mm);
-				mmput(cur_mm);
-				cur_mm = NULL;
-			}
-
 			prepare_to_wait(&ctx->sqo_wait, &wait,
 						TASK_INTERRUPTIBLE);
 

From 7455a8327674e1a7c9a1f5dd1b0743ab6713f6d1 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Fri, 14 Feb 2020 10:32:38 +0800
Subject: [PATCH 141/239] KVM: x86: don't notify userspace IOAPIC on
 edge-triggered interrupt EOI

Commit 13db77347db1 ("KVM: x86: don't notify userspace IOAPIC on edge
EOI") said, edge-triggered interrupts don't set a bit in TMR, which means
that IOAPIC isn't notified on EOI. And var level indicates level-triggered
interrupt.
But commit 3159d36ad799 ("KVM: x86: use generic function for MSI parsing")
replace var level with irq.level by mistake. Fix it by changing irq.level
to irq.trig_mode.

Cc: stable@vger.kernel.org
Fixes: 3159d36ad799 ("KVM: x86: use generic function for MSI parsing")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/irq_comm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 79afa0bb5f41..c47d2acec529 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -417,7 +417,7 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
 
 			kvm_set_msi_irq(vcpu->kvm, entry, &irq);
 
-			if (irq.level &&
+			if (irq.trig_mode &&
 			    kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
 						irq.dest_id, irq.dest_mode))
 				__set_bit(irq.vector, ioapic_handled_vectors);

From c9dfd3fb08352d439f0399b6fabe697681d2638c Mon Sep 17 00:00:00 2001
From: wanpeng li <wanpengli@tencent.com>
Date: Mon, 17 Feb 2020 18:37:43 +0800
Subject: [PATCH 142/239] KVM: nVMX: Hold KVM's srcu lock when syncing
 vmcs12->shadow

For the duration of mapping eVMCS, it derefences ->memslots without holding
->srcu or ->slots_lock when accessing hv assist page. This patch fixes it by
moving nested_sync_vmcs12_to_shadow to prepare_guest_switch, where the SRCU
is already taken.

It can be reproduced by running kvm's evmcs_test selftest.

  =============================
  warning: suspicious rcu usage
  5.6.0-rc1+ #53 tainted: g        w ioe
  -----------------------------
  ./include/linux/kvm_host.h:623 suspicious rcu_dereference_check() usage!

  other info that might help us debug this:

   rcu_scheduler_active = 2, debug_locks = 1
  1 lock held by evmcs_test/8507:
   #0: ffff9ddd156d00d0 (&vcpu->mutex){+.+.}, at:
kvm_vcpu_ioctl+0x85/0x680 [kvm]

  stack backtrace:
  cpu: 6 pid: 8507 comm: evmcs_test tainted: g        w ioe     5.6.0-rc1+ #53
  hardware name: dell inc. optiplex 7040/0jctf8, bios 1.4.9 09/12/2016
  call trace:
   dump_stack+0x68/0x9b
   kvm_read_guest_cached+0x11d/0x150 [kvm]
   kvm_hv_get_assist_page+0x33/0x40 [kvm]
   nested_enlightened_vmentry+0x2c/0x60 [kvm_intel]
   nested_vmx_handle_enlightened_vmptrld.part.52+0x32/0x1c0 [kvm_intel]
   nested_sync_vmcs12_to_shadow+0x439/0x680 [kvm_intel]
   vmx_vcpu_run+0x67a/0xe60 [kvm_intel]
   vcpu_enter_guest+0x35e/0x1bc0 [kvm]
   kvm_arch_vcpu_ioctl_run+0x40b/0x670 [kvm]
   kvm_vcpu_ioctl+0x370/0x680 [kvm]
   ksys_ioctl+0x235/0x850
   __x64_sys_ioctl+0x16/0x20
   do_syscall_64+0x77/0x780
   entry_syscall_64_after_hwframe+0x49/0xbe

Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3be25ecae145..dafe4df893c8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1175,6 +1175,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 					   vmx->guest_msrs[i].mask);
 
 	}
+
+    	if (vmx->nested.need_vmcs12_to_shadow_sync)
+		nested_sync_vmcs12_to_shadow(vcpu);
+
 	if (vmx->guest_state_loaded)
 		return;
 
@@ -6482,8 +6486,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		vmcs_write32(PLE_WINDOW, vmx->ple_window);
 	}
 
-	if (vmx->nested.need_vmcs12_to_shadow_sync)
-		nested_sync_vmcs12_to_shadow(vcpu);
+	/*
+	 * We did this in prepare_switch_to_guest, because it needs to
+	 * be within srcu_read_lock.
+	 */
+	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
 
 	if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
 		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);

From 624e18f92f2ec449c2b3a6d388c0dccc45e4c4d7 Mon Sep 17 00:00:00 2001
From: Xiaoyao Li <xiaoyao.li@intel.com>
Date: Sun, 16 Feb 2020 18:48:57 +0800
Subject: [PATCH 143/239] KVM: VMX: Add VMX_FEATURE_USR_WAIT_PAUSE

Commit 159348784ff0 ("x86/vmx: Introduce VMX_FEATURES_*") missed
bit 26 (enable user wait and pause) of Secondary Processor-based
VM-Execution Controls.

Add VMX_FEATURE_USR_WAIT_PAUSE flag so that it shows up in /proc/cpuinfo,
and use it to define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE to make them
uniform.

Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/vmx.h         | 2 +-
 arch/x86/include/asm/vmxfeatures.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2a85287b3685..8521af3fef27 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -72,7 +72,7 @@
 #define SECONDARY_EXEC_MODE_BASED_EPT_EXEC	VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
 #define SECONDARY_EXEC_PT_USE_GPA		VMCS_CONTROL_BIT(PT_USE_GPA)
 #define SECONDARY_EXEC_TSC_SCALING              VMCS_CONTROL_BIT(TSC_SCALING)
-#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE	0x04000000
+#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE	VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
 
 #define PIN_BASED_EXT_INTR_MASK                 VMCS_CONTROL_BIT(INTR_EXITING)
 #define PIN_BASED_NMI_EXITING                   VMCS_CONTROL_BIT(NMI_EXITING)
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index a50e4a0de315..9915990fd8cf 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -81,6 +81,7 @@
 #define VMX_FEATURE_MODE_BASED_EPT_EXEC	( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. user */
 #define VMX_FEATURE_PT_USE_GPA		( 2*32+ 24) /* "" Processor Trace logs GPAs */
 #define VMX_FEATURE_TSC_SCALING		( 2*32+ 25) /* Scale hardware TSC when read in guest */
+#define VMX_FEATURE_USR_WAIT_PAUSE	( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
 #define VMX_FEATURE_ENCLV_EXITING	( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
 
 #endif /* _ASM_X86_VMXFEATURES_H */

From 93fd9666c269877fffb74e14f52792d9c000c1f2 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Fri, 21 Feb 2020 08:52:17 -0600
Subject: [PATCH 144/239] kvm: x86: svm: Fix NULL pointer dereference when AVIC
 not enabled

Launching VM w/ AVIC disabled together with pass-through device
results in NULL pointer dereference bug with the following call trace.

    RIP: 0010:svm_refresh_apicv_exec_ctrl+0x17e/0x1a0 [kvm_amd]

    Call Trace:
     kvm_vcpu_update_apicv+0x44/0x60 [kvm]
     kvm_arch_vcpu_ioctl_run+0x3f4/0x1c80 [kvm]
     kvm_vcpu_ioctl+0x3d8/0x650 [kvm]
     do_vfs_ioctl+0xaa/0x660
     ? tomoyo_file_ioctl+0x19/0x20
     ksys_ioctl+0x67/0x90
     __x64_sys_ioctl+0x1a/0x20
     do_syscall_64+0x57/0x190
     entry_SYSCALL_64_after_hwframe+0x44/0xa9

Investigation shows that this is due to the uninitialized usage of
struct vapu_svm.ir_list in the svm_set_pi_irte_mode(), which is
called from svm_refresh_apicv_exec_ctrl().

The ir_list is initialized only if AVIC is enabled. So, fixes by
adding a check if AVIC is enabled in the svm_refresh_apicv_exec_ctrl().

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=206579
Fixes: 8937d762396d ("kvm: x86: svm: Add support to (de)activate posted interrupts.")
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Tested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bef0ba35f121..a391b29138f0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5232,6 +5232,9 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
 	struct vmcb *vmcb = svm->vmcb;
 	bool activated = kvm_vcpu_apicv_active(vcpu);
 
+	if (!avic)
+		return;
+
 	if (activated) {
 		/**
 		 * During AVIC temporary deactivation, guest could update

From 91a5f413af596ad01097e59bf487eb07cb3f1331 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 20 Feb 2020 18:22:05 +0100
Subject: [PATCH 145/239] KVM: nVMX: handle nested posted interrupts when apicv
 is disabled for L1

Even when APICv is disabled for L1 it can (and, actually, is) still
available for L2, this means we need to always call
vmx_deliver_nested_posted_interrupt() when attempting an interrupt
delivery.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/lapic.c            |  5 +----
 arch/x86/kvm/svm.c              |  7 ++++++-
 arch/x86/kvm/vmx/vmx.c          | 13 +++++++++----
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 40a0c0fd95ca..a84e8c5acda8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1146,7 +1146,7 @@ struct kvm_x86_ops {
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
-	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+	int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afcd30d44cbb..cc8ee8125712 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1046,11 +1046,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 						       apic->regs + APIC_TMR);
 		}
 
-		if (vcpu->arch.apicv_active)
-			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
-		else {
+		if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) {
 			kvm_lapic_set_irr(vector, apic);
-
 			kvm_make_request(KVM_REQ_EVENT, vcpu);
 			kvm_vcpu_kick(vcpu);
 		}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a391b29138f0..8787a123b8e7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5258,8 +5258,11 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 	return;
 }
 
-static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
 {
+	if (!vcpu->arch.apicv_active)
+		return -1;
+
 	kvm_lapic_set_irr(vec, vcpu->arch.apic);
 	smp_mb__after_atomic();
 
@@ -5271,6 +5274,8 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
 		put_cpu();
 	} else
 		kvm_vcpu_wake_up(vcpu);
+
+	return 0;
 }
 
 static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index dafe4df893c8..63ccc435a602 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3822,24 +3822,29 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
  * 2. If target vcpu isn't running(root mode), kick it to pick up the
  * interrupt from PIR in next vmentry.
  */
-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	int r;
 
 	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
 	if (!r)
-		return;
+		return 0;
+
+	if (!vcpu->arch.apicv_active)
+		return -1;
 
 	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
-		return;
+		return 0;
 
 	/* If a previous notification has sent the IPI, nothing to do.  */
 	if (pi_test_and_set_on(&vmx->pi_desc))
-		return;
+		return 0;
 
 	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
 		kvm_vcpu_kick(vcpu);
+
+	return 0;
 }
 
 /*

From a4443267800af240072280c44521caab61924e55 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 20 Feb 2020 18:22:04 +0100
Subject: [PATCH 146/239] KVM: nVMX: clear PIN_BASED_POSTED_INTR from nested
 pinbased_ctls only when apicv is globally disabled

When apicv is disabled on a vCPU (e.g. by enabling KVM_CAP_HYPERV_SYNIC*),
nothing happens to VMX MSRs on the already existing vCPUs, however, all new
ones are created with PIN_BASED_POSTED_INTR filtered out. This is very
confusing and results in the following picture inside the guest:

$ rdmsr -ax 0x48d
ff00000016
7f00000016
7f00000016
7f00000016

This is observed with QEMU and 4-vCPU guest: QEMU creates vCPU0, does
KVM_CAP_HYPERV_SYNIC2 and then creates the remaining three.

L1 hypervisor may only check CPU0's controls to find out what features
are available and it will be very confused later. Switch to setting
PIN_BASED_POSTED_INTR control based on global 'enable_apicv' setting.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/capabilities.h |  1 +
 arch/x86/kvm/vmx/nested.c       |  5 ++---
 arch/x86/kvm/vmx/nested.h       |  3 +--
 arch/x86/kvm/vmx/vmx.c          | 10 ++++------
 4 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 283bdb7071af..f486e2606247 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -12,6 +12,7 @@ extern bool __read_mostly enable_ept;
 extern bool __read_mostly enable_unrestricted_guest;
 extern bool __read_mostly enable_ept_ad_bits;
 extern bool __read_mostly enable_pml;
+extern bool __read_mostly enable_apicv;
 extern int __read_mostly pt_mode;
 
 #define PT_MODE_SYSTEM		0
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a5757b0b80f9..2b3ba7d27be4 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5949,8 +5949,7 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void)
  * bit in the high half is on if the corresponding bit in the control field
  * may be on. See also vmx_control_verify().
  */
-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
-				bool apicv)
+void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
 {
 	/*
 	 * Note that as a general rule, the high half of the MSRs (bits in
@@ -5977,7 +5976,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
 		PIN_BASED_EXT_INTR_MASK |
 		PIN_BASED_NMI_EXITING |
 		PIN_BASED_VIRTUAL_NMIS |
-		(apicv ? PIN_BASED_POSTED_INTR : 0);
+		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
 	msrs->pinbased_ctls_high |=
 		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		PIN_BASED_VMX_PREEMPTION_TIMER;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index fc874d4ead0f..1c5fbff45d69 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -17,8 +17,7 @@ enum nvmx_vmentry_status {
 };
 
 void vmx_leave_nested(struct kvm_vcpu *vcpu);
-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
-				bool apicv);
+void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
 void nested_vmx_hardware_unsetup(void);
 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
 void nested_vmx_set_vmcs_shadowing_bitmap(void);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63ccc435a602..404dafedd778 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -95,7 +95,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
-static bool __read_mostly enable_apicv = 1;
+bool __read_mostly enable_apicv = 1;
 module_param(enable_apicv, bool, S_IRUGO);
 
 /*
@@ -6769,8 +6769,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
 
 	if (nested)
 		nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
-					   vmx_capability.ept,
-					   kvm_vcpu_apicv_active(vcpu));
+					   vmx_capability.ept);
 	else
 		memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
 
@@ -6851,8 +6850,7 @@ static int __init vmx_check_processor_compat(void)
 	if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
 		return -EIO;
 	if (nested)
-		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
-					   enable_apicv);
+		nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
 	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
 		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
 				smp_processor_id());
@@ -7714,7 +7712,7 @@ static __init int hardware_setup(void)
 
 	if (nested) {
 		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
-					   vmx_capability.ept, enable_apicv);
+					   vmx_capability.ept);
 
 		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
 		if (r)

From 23520b2def95205f132e167cf5b25c609975e959 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Fri, 21 Feb 2020 22:04:46 +0800
Subject: [PATCH 147/239] KVM: apic: avoid calculating pending eoi from an
 uninitialized val

When pv_eoi_get_user() fails, 'val' may remain uninitialized and the return
value of pv_eoi_get_pending() becomes random. Fix the issue by initializing
the variable.

Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cc8ee8125712..e3099c642fec 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -627,9 +627,11 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
 static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
 {
 	u8 val;
-	if (pv_eoi_get_user(vcpu, &val) < 0)
+	if (pv_eoi_get_user(vcpu, &val) < 0) {
 		printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n",
 			   (unsigned long long)vcpu->arch.pv_eoi.msr_val);
+		return false;
+	}
 	return val & 0x1;
 }
 

From d80b64ff297e40c2b6f7d7abc1b3eba70d22a068 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Sat, 4 Jan 2020 16:56:49 +0800
Subject: [PATCH 148/239] KVM: SVM: Fix potential memory leak in svm_cpu_init()

When kmalloc memory for sd->sev_vmcbs failed, we forget to free the page
held by sd->save_area. Also get rid of the var r as '-ENOMEM' is actually
the only possible outcome here.

Reviewed-by: Liran Alon <liran.alon@oracle.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8787a123b8e7..ff02aeb23616 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1005,33 +1005,32 @@ static void svm_cpu_uninit(int cpu)
 static int svm_cpu_init(int cpu)
 {
 	struct svm_cpu_data *sd;
-	int r;
 
 	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
 	if (!sd)
 		return -ENOMEM;
 	sd->cpu = cpu;
-	r = -ENOMEM;
 	sd->save_area = alloc_page(GFP_KERNEL);
 	if (!sd->save_area)
-		goto err_1;
+		goto free_cpu_data;
 
 	if (svm_sev_enabled()) {
-		r = -ENOMEM;
 		sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
 					      sizeof(void *),
 					      GFP_KERNEL);
 		if (!sd->sev_vmcbs)
-			goto err_1;
+			goto free_save_area;
 	}
 
 	per_cpu(svm_data, cpu) = sd;
 
 	return 0;
 
-err_1:
+free_save_area:
+	__free_page(sd->save_area);
+free_cpu_data:
 	kfree(sd);
-	return r;
+	return -ENOMEM;
 
 }
 

From e61d2392253220477813b43412090dccdcac601f Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Thu, 20 Feb 2020 06:29:48 -0800
Subject: [PATCH 149/239] hwmon: (w83627ehf) Fix crash seen with W83627DHG-P

Loading the driver on a system with W83627DHG-P crashes as follows.

w83627ehf: Found W83627DHG-P chip at 0x290
BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD 0 P4D 0
Oops: 0000 [#1] SMP NOPTI
CPU: 0 PID: 604 Comm: sensors Not tainted 5.6.0-rc2-00055-gca7e1fd1026c #29
Hardware name:  /D425KT, BIOS MWPNT10N.86A.0132.2013.0726.1534 07/26/2013
RIP: 0010:w83627ehf_read_string+0x27/0x70 [w83627ehf]
Code: [... ]
RSP: 0018:ffffb95980657df8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: ffff96caaa7f5218 RCX: 0000000000000000
RDX: 0000000000000015 RSI: 0000000000000001 RDI: ffff96caa736ec08
RBP: 0000000000000000 R08: ffffb95980657e20 R09: 0000000000000001
R10: ffff96caaa635cc0 R11: 0000000000000000 R12: ffff96caa9f7cf00
R13: ffff96caa9ec3d00 R14: ffff96caa9ec3d28 R15: ffff96caa9ec3d40
FS:  00007fbc7c4e2740(0000) GS:ffff96caabc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 0000000129d58000 CR4: 00000000000006f0
Call Trace:
 ? cp_new_stat+0x12d/0x160
 hwmon_attr_show_string+0x37/0x70 [hwmon]
 dev_attr_show+0x14/0x50
 sysfs_kf_seq_show+0xb5/0x1b0
 seq_read+0xcf/0x460
 vfs_read+0x9b/0x150
 ksys_read+0x5f/0xe0
 do_syscall_64+0x48/0x190
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
...

Temperature labels are not always present. Adjust sysfs attribute
visibility accordingly.

Reported-by: Meelis Roos <mroos@linux.ee>
Suggested-by: Dr. David Alan Gilbert <linux@treblig.org>
Reviewed-by: Dr. David Alan Gilbert <linux@treblig.org>
Cc: Meelis Roos <mroos@linux.ee>
Cc: Dr. David Alan Gilbert <linux@treblig.org>
Fixes: 266cd5835947 ("hwmon: (w83627ehf) convert to with_info interface")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/w83627ehf.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/hwmon/w83627ehf.c b/drivers/hwmon/w83627ehf.c
index 7ffadc2da57b..5a5120121e50 100644
--- a/drivers/hwmon/w83627ehf.c
+++ b/drivers/hwmon/w83627ehf.c
@@ -1346,8 +1346,13 @@ w83627ehf_is_visible(const void *drvdata, enum hwmon_sensor_types type,
 		/* channel 0.., name 1.. */
 		if (!(data->have_temp & (1 << channel)))
 			return 0;
-		if (attr == hwmon_temp_input || attr == hwmon_temp_label)
+		if (attr == hwmon_temp_input)
 			return 0444;
+		if (attr == hwmon_temp_label) {
+			if (data->temp_label)
+				return 0444;
+			return 0;
+		}
 		if (channel == 2 && data->temp3_val_only)
 			return 0;
 		if (attr == hwmon_temp_max) {

From 7c990728b99ed6fbe9c75fc202fce1172d9916da Mon Sep 17 00:00:00 2001
From: Suraj Jitindar Singh <surajjs@amazon.com>
Date: Tue, 18 Feb 2020 19:08:51 -0800
Subject: [PATCH 150/239] ext4: fix potential race between s_flex_groups online
 resizing and access

During an online resize an array of s_flex_groups structures gets replaced
so it can get enlarged. If there is a concurrent access to the array and
this memory has been reused then this can lead to an invalid memory access.

The s_flex_group array has been converted into an array of pointers rather
than an array of structures. This is to ensure that the information
contained in the structures cannot get out of sync during a resize due to
an accessor updating the value in the old structure after it has been
copied but before the array pointer is updated. Since the structures them-
selves are no longer copied but only the pointers to them this case is
mitigated.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443
Link: https://lore.kernel.org/r/20200221053458.730016-4-tytso@mit.edu
Signed-off-by: Suraj Jitindar Singh <surajjs@amazon.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/ext4.h    |  2 +-
 fs/ext4/ialloc.c  | 23 +++++++++------
 fs/ext4/mballoc.c |  9 ++++--
 fs/ext4/resize.c  |  7 +++--
 fs/ext4/super.c   | 72 ++++++++++++++++++++++++++++++++---------------
 5 files changed, 76 insertions(+), 37 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b1ece5329738..614fefa7dc7a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1512,7 +1512,7 @@ struct ext4_sb_info {
 	unsigned int s_extent_max_zeroout_kb;
 
 	unsigned int s_log_groups_per_flex;
-	struct flex_groups *s_flex_groups;
+	struct flex_groups * __rcu *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
 	/* workqueue for reserved extent conversions (buffered io) */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c66e8f9451a2..f95ee99091e4 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -328,11 +328,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 	percpu_counter_inc(&sbi->s_freeinodes_counter);
 	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t f = ext4_flex_group(sbi, block_group);
+		struct flex_groups *fg;
 
-		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		fg = sbi_array_rcu_deref(sbi, s_flex_groups,
+					 ext4_flex_group(sbi, block_group));
+		atomic_inc(&fg->free_inodes);
 		if (is_directory)
-			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+			atomic_dec(&fg->used_dirs);
 	}
 	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
 	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
@@ -368,12 +370,13 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
 			    int flex_size, struct orlov_stats *stats)
 {
 	struct ext4_group_desc *desc;
-	struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
 
 	if (flex_size > 1) {
-		stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
-		stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
-		stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+		struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb),
+							     s_flex_groups, g);
+		stats->free_inodes = atomic_read(&fg->free_inodes);
+		stats->free_clusters = atomic64_read(&fg->free_clusters);
+		stats->used_dirs = atomic_read(&fg->used_dirs);
 		return;
 	}
 
@@ -1054,7 +1057,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		if (sbi->s_log_groups_per_flex) {
 			ext4_group_t f = ext4_flex_group(sbi, group);
 
-			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+			atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+							f)->used_dirs);
 		}
 	}
 	if (ext4_has_group_desc_csum(sb)) {
@@ -1077,7 +1081,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 
 	if (sbi->s_log_groups_per_flex) {
 		flex_group = ext4_flex_group(sbi, group);
-		atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
+		atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
+						flex_group)->free_inodes);
 	}
 
 	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1b46fb63692a..51a78eb65f3c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3038,7 +3038,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		ext4_group_t flex_group = ext4_flex_group(sbi,
 							  ac->ac_b_ex.fe_group);
 		atomic64_sub(ac->ac_b_ex.fe_len,
-			     &sbi->s_flex_groups[flex_group].free_clusters);
+			     &sbi_array_rcu_deref(sbi, s_flex_groups,
+						  flex_group)->free_clusters);
 	}
 
 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4936,7 +4937,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
 		atomic64_add(count_clusters,
-			     &sbi->s_flex_groups[flex_group].free_clusters);
+			     &sbi_array_rcu_deref(sbi, s_flex_groups,
+						  flex_group)->free_clusters);
 	}
 
 	/*
@@ -5093,7 +5095,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
 		atomic64_add(clusters_freed,
-			     &sbi->s_flex_groups[flex_group].free_clusters);
+			     &sbi_array_rcu_deref(sbi, s_flex_groups,
+						  flex_group)->free_clusters);
 	}
 
 	ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 536cc9f38091..a50b51270ea9 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1430,11 +1430,14 @@ static void ext4_update_super(struct super_block *sb,
 		   percpu_counter_read(&sbi->s_freeclusters_counter));
 	if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
+		struct flex_groups *fg;
+
 		flex_group = ext4_flex_group(sbi, group_data[0].group);
+		fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
 		atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
-			     &sbi->s_flex_groups[flex_group].free_clusters);
+			     &fg->free_clusters);
 		atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
-			   &sbi->s_flex_groups[flex_group].free_inodes);
+			   &fg->free_inodes);
 	}
 
 	/*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e00bcc19099f..6b7e628b7903 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1015,6 +1015,7 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	struct buffer_head **group_desc;
+	struct flex_groups **flex_groups;
 	int aborted = 0;
 	int i, err;
 
@@ -1052,8 +1053,13 @@ static void ext4_put_super(struct super_block *sb)
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(group_desc[i]);
 	kvfree(group_desc);
+	flex_groups = rcu_dereference(sbi->s_flex_groups);
+	if (flex_groups) {
+		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+			kvfree(flex_groups[i]);
+		kvfree(flex_groups);
+	}
 	rcu_read_unlock();
-	kvfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -2384,8 +2390,8 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct flex_groups *new_groups;
-	int size;
+	struct flex_groups **old_groups, **new_groups;
+	int size, i;
 
 	if (!sbi->s_log_groups_per_flex)
 		return 0;
@@ -2394,22 +2400,37 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
 	if (size <= sbi->s_flex_groups_allocated)
 		return 0;
 
-	size = roundup_pow_of_two(size * sizeof(struct flex_groups));
-	new_groups = kvzalloc(size, GFP_KERNEL);
+	new_groups = kvzalloc(roundup_pow_of_two(size *
+			      sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
 	if (!new_groups) {
-		ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
-			 size / (int) sizeof(struct flex_groups));
+		ext4_msg(sb, KERN_ERR,
+			 "not enough memory for %d flex group pointers", size);
 		return -ENOMEM;
 	}
-
-	if (sbi->s_flex_groups) {
-		memcpy(new_groups, sbi->s_flex_groups,
-		       (sbi->s_flex_groups_allocated *
-			sizeof(struct flex_groups)));
-		kvfree(sbi->s_flex_groups);
+	for (i = sbi->s_flex_groups_allocated; i < size; i++) {
+		new_groups[i] = kvzalloc(roundup_pow_of_two(
+					 sizeof(struct flex_groups)),
+					 GFP_KERNEL);
+		if (!new_groups[i]) {
+			for (i--; i >= sbi->s_flex_groups_allocated; i--)
+				kvfree(new_groups[i]);
+			kvfree(new_groups);
+			ext4_msg(sb, KERN_ERR,
+				 "not enough memory for %d flex groups", size);
+			return -ENOMEM;
+		}
 	}
-	sbi->s_flex_groups = new_groups;
-	sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+	rcu_read_lock();
+	old_groups = rcu_dereference(sbi->s_flex_groups);
+	if (old_groups)
+		memcpy(new_groups, old_groups,
+		       (sbi->s_flex_groups_allocated *
+			sizeof(struct flex_groups *)));
+	rcu_read_unlock();
+	rcu_assign_pointer(sbi->s_flex_groups, new_groups);
+	sbi->s_flex_groups_allocated = size;
+	if (old_groups)
+		ext4_kvfree_array_rcu(old_groups);
 	return 0;
 }
 
@@ -2417,6 +2438,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
+	struct flex_groups *fg;
 	ext4_group_t flex_group;
 	int i, err;
 
@@ -2434,12 +2456,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
-		atomic_add(ext4_free_inodes_count(sb, gdp),
-			   &sbi->s_flex_groups[flex_group].free_inodes);
+		fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
+		atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
 		atomic64_add(ext4_free_group_clusters(sb, gdp),
-			     &sbi->s_flex_groups[flex_group].free_clusters);
-		atomic_add(ext4_used_dirs_count(sb, gdp),
-			   &sbi->s_flex_groups[flex_group].used_dirs);
+			     &fg->free_clusters);
+		atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
 	}
 
 	return 1;
@@ -3641,6 +3662,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	struct buffer_head *bh, **group_desc;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	struct flex_groups **flex_groups;
 	ext4_fsblk_t block;
 	ext4_fsblk_t sb_block = get_sb_block(&data);
 	ext4_fsblk_t logical_sb_block;
@@ -4692,8 +4714,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ext4_unregister_li_request(sb);
 failed_mount6:
 	ext4_mb_release(sb);
-	if (sbi->s_flex_groups)
-		kvfree(sbi->s_flex_groups);
+	rcu_read_lock();
+	flex_groups = rcu_dereference(sbi->s_flex_groups);
+	if (flex_groups) {
+		for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+			kvfree(flex_groups[i]);
+		kvfree(flex_groups);
+	}
+	rcu_read_unlock();
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);

From bbd55937de8f2754adc5792b0f8e5ff7d9c0420e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 19 Feb 2020 10:30:46 -0800
Subject: [PATCH 151/239] ext4: rename s_journal_flag_rwsem to
 s_writepages_rwsem

In preparation for making s_journal_flag_rwsem synchronize
ext4_writepages() with changes to both the EXTENTS and JOURNAL_DATA
flags (rather than just JOURNAL_DATA as it does currently), rename it to
s_writepages_rwsem.

Link: https://lore.kernel.org/r/20200219183047.47417-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
---
 fs/ext4/ext4.h  |  2 +-
 fs/ext4/inode.c | 14 +++++++-------
 fs/ext4/super.c |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 614fefa7dc7a..4b986ad42b9d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1553,7 +1553,7 @@ struct ext4_sb_info {
 	struct ratelimit_state s_msg_ratelimit_state;
 
 	/* Barrier between changing inodes' journal flags and writepages ops. */
-	struct percpu_rw_semaphore s_journal_flag_rwsem;
+	struct percpu_rw_semaphore s_writepages_rwsem;
 	struct dax_device *s_daxdev;
 #ifdef CONFIG_EXT4_DEBUG
 	unsigned long s_simulate_fail;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6e1d81ed44ad..fa0ff78dc033 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2628,7 +2628,7 @@ static int ext4_writepages(struct address_space *mapping,
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
-	percpu_down_read(&sbi->s_journal_flag_rwsem);
+	percpu_down_read(&sbi->s_writepages_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
 	/*
@@ -2849,7 +2849,7 @@ static int ext4_writepages(struct address_space *mapping,
 out_writepages:
 	trace_ext4_writepages_result(inode, wbc, ret,
 				     nr_to_write - wbc->nr_to_write);
-	percpu_up_read(&sbi->s_journal_flag_rwsem);
+	percpu_up_read(&sbi->s_writepages_rwsem);
 	return ret;
 }
 
@@ -2864,13 +2864,13 @@ static int ext4_dax_writepages(struct address_space *mapping,
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;
 
-	percpu_down_read(&sbi->s_journal_flag_rwsem);
+	percpu_down_read(&sbi->s_writepages_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
 	ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
 	trace_ext4_writepages_result(inode, wbc, ret,
 				     nr_to_write - wbc->nr_to_write);
-	percpu_up_read(&sbi->s_journal_flag_rwsem);
+	percpu_up_read(&sbi->s_writepages_rwsem);
 	return ret;
 }
 
@@ -5861,7 +5861,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		}
 	}
 
-	percpu_down_write(&sbi->s_journal_flag_rwsem);
+	percpu_down_write(&sbi->s_writepages_rwsem);
 	jbd2_journal_lock_updates(journal);
 
 	/*
@@ -5878,7 +5878,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		err = jbd2_journal_flush(journal);
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
-			percpu_up_write(&sbi->s_journal_flag_rwsem);
+			percpu_up_write(&sbi->s_writepages_rwsem);
 			return err;
 		}
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -5886,7 +5886,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
-	percpu_up_write(&sbi->s_journal_flag_rwsem);
+	percpu_up_write(&sbi->s_writepages_rwsem);
 
 	if (val)
 		up_write(&EXT4_I(inode)->i_mmap_sem);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6b7e628b7903..6928fc229799 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1064,7 +1064,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
-	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+	percpu_free_rwsem(&sbi->s_writepages_rwsem);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
 		kfree(get_qf_name(sb, sbi, i));
@@ -4626,7 +4626,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
 					  GFP_KERNEL);
 	if (!err)
-		err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+		err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
 
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -4726,7 +4726,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
-	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+	percpu_free_rwsem(&sbi->s_writepages_rwsem);
 failed_mount5:
 	ext4_ext_release(sb);
 	ext4_release_system_zone(sb);

From cb85f4d23f794e24127f3e562cb3b54b0803f456 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 19 Feb 2020 10:30:47 -0800
Subject: [PATCH 152/239] ext4: fix race between writepages and enabling
 EXT4_EXTENTS_FL

If EXT4_EXTENTS_FL is set on an inode while ext4_writepages() is running
on it, the following warning in ext4_add_complete_io() can be hit:

WARNING: CPU: 1 PID: 0 at fs/ext4/page-io.c:234 ext4_put_io_end_defer+0xf0/0x120

Here's a minimal reproducer (not 100% reliable) (root isn't required):

        while true; do
                sync
        done &
        while true; do
                rm -f file
                touch file
                chattr -e file
                echo X >> file
                chattr +e file
        done

The problem is that in ext4_writepages(), ext4_should_dioread_nolock()
(which only returns true on extent-based files) is checked once to set
the number of reserved journal credits, and also again later to select
the flags for ext4_map_blocks() and copy the reserved journal handle to
ext4_io_end::handle.  But if EXT4_EXTENTS_FL is being concurrently set,
the first check can see dioread_nolock disabled while the later one can
see it enabled, causing the reserved handle to unexpectedly be NULL.

Since changing EXT4_EXTENTS_FL is uncommon, and there may be other races
related to doing so as well, fix this by synchronizing changing
EXT4_EXTENTS_FL with ext4_writepages() via the existing
s_writepages_rwsem (previously called s_journal_flag_rwsem).

This was originally reported by syzbot without a reproducer at
https://syzkaller.appspot.com/bug?extid=2202a584a00fffd19fbf,
but now that dioread_nolock is the default I also started seeing this
when running syzkaller locally.

Link: https://lore.kernel.org/r/20200219183047.47417-3-ebiggers@kernel.org
Reported-by: syzbot+2202a584a00fffd19fbf@syzkaller.appspotmail.com
Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
---
 fs/ext4/ext4.h    |  5 ++++-
 fs/ext4/migrate.c | 27 +++++++++++++++++++--------
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4b986ad42b9d..61b37a052052 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1552,7 +1552,10 @@ struct ext4_sb_info {
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
 
-	/* Barrier between changing inodes' journal flags and writepages ops. */
+	/*
+	 * Barrier between writepages ops and changing any inode's JOURNAL_DATA
+	 * or EXTENTS flag.
+	 */
 	struct percpu_rw_semaphore s_writepages_rwsem;
 	struct dax_device *s_daxdev;
 #ifdef CONFIG_EXT4_DEBUG
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 89725fa42573..fb6520f37135 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -407,6 +407,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
 
 int ext4_ext_migrate(struct inode *inode)
 {
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	handle_t *handle;
 	int retval = 0, i;
 	__le32 *i_data;
@@ -431,6 +432,8 @@ int ext4_ext_migrate(struct inode *inode)
 		 */
 		return retval;
 
+	percpu_down_write(&sbi->s_writepages_rwsem);
+
 	/*
 	 * Worst case we can touch the allocation bitmaps, a bgd
 	 * block, and a block to link in the orphan list.  We do need
@@ -441,7 +444,7 @@ int ext4_ext_migrate(struct inode *inode)
 
 	if (IS_ERR(handle)) {
 		retval = PTR_ERR(handle);
-		return retval;
+		goto out_unlock;
 	}
 	goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
 		EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
@@ -452,7 +455,7 @@ int ext4_ext_migrate(struct inode *inode)
 	if (IS_ERR(tmp_inode)) {
 		retval = PTR_ERR(tmp_inode);
 		ext4_journal_stop(handle);
-		return retval;
+		goto out_unlock;
 	}
 	i_size_write(tmp_inode, i_size_read(inode));
 	/*
@@ -494,7 +497,7 @@ int ext4_ext_migrate(struct inode *inode)
 		 */
 		ext4_orphan_del(NULL, tmp_inode);
 		retval = PTR_ERR(handle);
-		goto out;
+		goto out_tmp_inode;
 	}
 
 	ei = EXT4_I(inode);
@@ -576,10 +579,11 @@ int ext4_ext_migrate(struct inode *inode)
 	ext4_ext_tree_init(handle, tmp_inode);
 out_stop:
 	ext4_journal_stop(handle);
-out:
+out_tmp_inode:
 	unlock_new_inode(tmp_inode);
 	iput(tmp_inode);
-
+out_unlock:
+	percpu_up_write(&sbi->s_writepages_rwsem);
 	return retval;
 }
 
@@ -589,7 +593,8 @@ int ext4_ext_migrate(struct inode *inode)
 int ext4_ind_migrate(struct inode *inode)
 {
 	struct ext4_extent_header	*eh;
-	struct ext4_super_block		*es = EXT4_SB(inode->i_sb)->s_es;
+	struct ext4_sb_info		*sbi = EXT4_SB(inode->i_sb);
+	struct ext4_super_block		*es = sbi->s_es;
 	struct ext4_inode_info		*ei = EXT4_I(inode);
 	struct ext4_extent		*ex;
 	unsigned int			i, len;
@@ -613,9 +618,13 @@ int ext4_ind_migrate(struct inode *inode)
 	if (test_opt(inode->i_sb, DELALLOC))
 		ext4_alloc_da_blocks(inode);
 
+	percpu_down_write(&sbi->s_writepages_rwsem);
+
 	handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_unlock;
+	}
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ret = ext4_ext_check_inode(inode);
@@ -650,5 +659,7 @@ int ext4_ind_migrate(struct inode *inode)
 errout:
 	ext4_journal_stop(handle);
 	up_write(&EXT4_I(inode)->i_data_sem);
+out_unlock:
+	percpu_up_write(&sbi->s_writepages_rwsem);
 	return ret;
 }

From 8eedabfd66b68a4623beec0789eac54b8c9d0fb6 Mon Sep 17 00:00:00 2001
From: wangyan <wangyan122@huawei.com>
Date: Thu, 20 Feb 2020 21:46:14 +0800
Subject: [PATCH 153/239] jbd2: fix ocfs2 corrupt when clearing block group
 bits

I found a NULL pointer dereference in ocfs2_block_group_clear_bits().
The running environment:
	kernel version: 4.19
	A cluster with two nodes, 5 luns mounted on two nodes, and do some
	file operations like dd/fallocate/truncate/rm on every lun with storage
	network disconnection.

The fallocate operation on dm-23-45 caused an null pointer dereference.

The information of NULL pointer dereference as follows:
	[577992.878282] JBD2: Error -5 detected when updating journal superblock for dm-23-45.
	[577992.878290] Aborting journal on device dm-23-45.
	...
	[577992.890778] JBD2: Error -5 detected when updating journal superblock for dm-24-46.
	[577992.890908] __journal_remove_journal_head: freeing b_committed_data
	[577992.890916] (fallocate,88392,52):ocfs2_extend_trans:474 ERROR: status = -30
	[577992.890918] __journal_remove_journal_head: freeing b_committed_data
	[577992.890920] (fallocate,88392,52):ocfs2_rotate_tree_right:2500 ERROR: status = -30
	[577992.890922] __journal_remove_journal_head: freeing b_committed_data
	[577992.890924] (fallocate,88392,52):ocfs2_do_insert_extent:4382 ERROR: status = -30
	[577992.890928] (fallocate,88392,52):ocfs2_insert_extent:4842 ERROR: status = -30
	[577992.890928] __journal_remove_journal_head: freeing b_committed_data
	[577992.890930] (fallocate,88392,52):ocfs2_add_clusters_in_btree:4947 ERROR: status = -30
	[577992.890933] __journal_remove_journal_head: freeing b_committed_data
	[577992.890939] __journal_remove_journal_head: freeing b_committed_data
	[577992.890949] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020
	[577992.890950] Mem abort info:
	[577992.890951]   ESR = 0x96000004
	[577992.890952]   Exception class = DABT (current EL), IL = 32 bits
	[577992.890952]   SET = 0, FnV = 0
	[577992.890953]   EA = 0, S1PTW = 0
	[577992.890954] Data abort info:
	[577992.890955]   ISV = 0, ISS = 0x00000004
	[577992.890956]   CM = 0, WnR = 0
	[577992.890958] user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000f8da07a9
	[577992.890960] [0000000000000020] pgd=0000000000000000
	[577992.890964] Internal error: Oops: 96000004 [#1] SMP
	[577992.890965] Process fallocate (pid: 88392, stack limit = 0x00000000013db2fd)
	[577992.890968] CPU: 52 PID: 88392 Comm: fallocate Kdump: loaded Tainted: G        W  OE     4.19.36 #1
	[577992.890969] Hardware name: Huawei TaiShan 2280 V2/BC82AMDD, BIOS 0.98 08/25/2019
	[577992.890971] pstate: 60400009 (nZCv daif +PAN -UAO)
	[577992.891054] pc : _ocfs2_free_suballoc_bits+0x63c/0x968 [ocfs2]
	[577992.891082] lr : _ocfs2_free_suballoc_bits+0x618/0x968 [ocfs2]
	[577992.891084] sp : ffff0000c8e2b810
	[577992.891085] x29: ffff0000c8e2b820 x28: 0000000000000000
	[577992.891087] x27: 00000000000006f3 x26: ffffa07957b02e70
	[577992.891089] x25: ffff807c59d50000 x24: 00000000000006f2
	[577992.891091] x23: 0000000000000001 x22: ffff807bd39abc30
	[577992.891093] x21: ffff0000811d9000 x20: ffffa07535d6a000
	[577992.891097] x19: ffff000001681638 x18: ffffffffffffffff
	[577992.891098] x17: 0000000000000000 x16: ffff000080a03df0
	[577992.891100] x15: ffff0000811d9708 x14: 203d207375746174
	[577992.891101] x13: 73203a524f525245 x12: 20373439343a6565
	[577992.891103] x11: 0000000000000038 x10: 0101010101010101
	[577992.891106] x9 : ffffa07c68a85d70 x8 : 7f7f7f7f7f7f7f7f
	[577992.891109] x7 : 0000000000000000 x6 : 0000000000000080
	[577992.891110] x5 : 0000000000000000 x4 : 0000000000000002
	[577992.891112] x3 : ffff000001713390 x2 : 2ff90f88b1c22f00
	[577992.891114] x1 : ffff807bd39abc30 x0 : 0000000000000000
	[577992.891116] Call trace:
	[577992.891139]  _ocfs2_free_suballoc_bits+0x63c/0x968 [ocfs2]
	[577992.891162]  _ocfs2_free_clusters+0x100/0x290 [ocfs2]
	[577992.891185]  ocfs2_free_clusters+0x50/0x68 [ocfs2]
	[577992.891206]  ocfs2_add_clusters_in_btree+0x198/0x5e0 [ocfs2]
	[577992.891227]  ocfs2_add_inode_data+0x94/0xc8 [ocfs2]
	[577992.891248]  ocfs2_extend_allocation+0x1bc/0x7a8 [ocfs2]
	[577992.891269]  ocfs2_allocate_extents+0x14c/0x338 [ocfs2]
	[577992.891290]  __ocfs2_change_file_space+0x3f8/0x610 [ocfs2]
	[577992.891309]  ocfs2_fallocate+0xe4/0x128 [ocfs2]
	[577992.891316]  vfs_fallocate+0x11c/0x250
	[577992.891317]  ksys_fallocate+0x54/0x88
	[577992.891319]  __arm64_sys_fallocate+0x28/0x38
	[577992.891323]  el0_svc_common+0x78/0x130
	[577992.891325]  el0_svc_handler+0x38/0x78
	[577992.891327]  el0_svc+0x8/0xc

My analysis process as follows:
ocfs2_fallocate
  __ocfs2_change_file_space
    ocfs2_allocate_extents
      ocfs2_extend_allocation
        ocfs2_add_inode_data
          ocfs2_add_clusters_in_btree
            ocfs2_insert_extent
              ocfs2_do_insert_extent
                ocfs2_rotate_tree_right
                  ocfs2_extend_rotate_transaction
                    ocfs2_extend_trans
                      jbd2_journal_restart
                        jbd2__journal_restart
                          /* handle->h_transaction is NULL,
                           * is_handle_aborted(handle) is true
                           */
                          handle->h_transaction = NULL;
                          start_this_handle
                            return -EROFS;
            ocfs2_free_clusters
              _ocfs2_free_clusters
                _ocfs2_free_suballoc_bits
                  ocfs2_block_group_clear_bits
                    ocfs2_journal_access_gd
                      __ocfs2_journal_access
                        jbd2_journal_get_undo_access
                          /* I think jbd2_write_access_granted() will
                           * return true, because do_get_write_access()
                           * will return -EROFS.
                           */
                          if (jbd2_write_access_granted(...)) return 0;
                          do_get_write_access
                            /* handle->h_transaction is NULL, it will
                             * return -EROFS here, so do_get_write_access()
                             * was not called.
                             */
                            if (is_handle_aborted(handle)) return -EROFS;
                    /* bh2jh(group_bh) is NULL, caused NULL
                       pointer dereference */
                    undo_bg = (struct ocfs2_group_desc *)
                                bh2jh(group_bh)->b_committed_data;

If handle->h_transaction == NULL, then jbd2_write_access_granted()
does not really guarantee that journal_head will stay around,
not even speaking of its b_committed_data. The bh2jh(group_bh)
can be removed after ocfs2_journal_access_gd() and before call
"bh2jh(group_bh)->b_committed_data". So, we should move
is_handle_aborted() check from do_get_write_access() into
jbd2_journal_get_undo_access() and jbd2_journal_get_write_access()
before the call to jbd2_write_access_granted().

Link: https://lore.kernel.org/r/f72a623f-b3f1-381a-d91d-d22a1c83a336@huawei.com
Signed-off-by: Yan Wang <wangyan122@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
---
 fs/jbd2/transaction.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 2dd848a743ed..d181948c0390 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -936,8 +936,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
 	char *frozen_buffer = NULL;
 	unsigned long start_lock, time_lock;
 
-	if (is_handle_aborted(handle))
-		return -EROFS;
 	journal = transaction->t_journal;
 
 	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -1189,6 +1187,9 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
 	struct journal_head *jh;
 	int rc;
 
+	if (is_handle_aborted(handle))
+		return -EROFS;
+
 	if (jbd2_write_access_granted(handle, bh, false))
 		return 0;
 
@@ -1326,6 +1327,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 	struct journal_head *jh;
 	char *committed_data = NULL;
 
+	if (is_handle_aborted(handle))
+		return -EROFS;
+
 	if (jbd2_write_access_granted(handle, bh, true))
 		return 0;
 

From 9db176bceb5c5df4990486709da386edadc6bd1d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 21 Feb 2020 11:08:35 +0100
Subject: [PATCH 154/239] ext4: fix mount failure with quota configured as
 module

When CONFIG_QFMT_V2 is configured as a module, the test in
ext4_feature_set_ok() fails and so mount of filesystems with quota or
project features fails. Fix the test to use IS_ENABLED macro which
works properly even for modules.

Link: https://lore.kernel.org/r/20200221100835.9332-1-jack@suse.cz
Fixes: d65d87a07476 ("ext4: improve explanation of a mount failure caused by a misconfigured kernel")
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6928fc229799..ff1b764b0c0e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3034,7 +3034,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 		return 0;
 	}
 
-#if !defined(CONFIG_QUOTA) || !defined(CONFIG_QFMT_V2)
+#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
 	if (!readonly && (ext4_has_feature_quota(sb) ||
 			  ext4_has_feature_project(sb))) {
 		ext4_msg(sb, KERN_ERR,

From f66ee0410b1c3481ee75e5db9b34547b4d582465 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@netfilter.org>
Date: Tue, 11 Feb 2020 23:20:43 +0100
Subject: [PATCH 155/239] netfilter: ipset: Fix "INFO: rcu detected stall in
 hash_xxx" reports

In the case of huge hash:* types of sets, due to the single spinlock of
a set the processing of the whole set under spinlock protection could take
too long.

There were four places where the whole hash table of the set was processed
from bucket to bucket under holding the spinlock:

- During resizing a set, the original set was locked to exclude kernel side
  add/del element operations (userspace add/del is excluded by the
  nfnetlink mutex). The original set is actually just read during the
  resize, so the spinlocking is replaced with rcu locking of regions.
  However, thus there can be parallel kernel side add/del of entries.
  In order not to loose those operations a backlog is added and replayed
  after the successful resize.
- Garbage collection of timed out entries was also protected by the spinlock.
  In order not to lock too long, region locking is introduced and a single
  region is processed in one gc go. Also, the simple timer based gc running
  is replaced with a workqueue based solution. The internal book-keeping
  (number of elements, size of extensions) is moved to region level due to
  the region locking.
- Adding elements: when the max number of the elements is reached, the gc
  was called to evict the timed out entries. The new approach is that the gc
  is called just for the matching region, assuming that if the region
  (proportionally) seems to be full, then the whole set does. We could scan
  the other regions to check every entry under rcu locking, but for huge
  sets it'd mean a slowdown at adding elements.
- Listing the set header data: when the set was defined with timeout
  support, the garbage collector was called to clean up timed out entries
  to get the correct element numbers and set size values. Now the set is
  scanned to check non-timed out entries, without actually calling the gc
  for the whole set.

Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe ->
SOFTIRQ-unsafe lock order issues during working on the patch.

Reported-by: syzbot+4b0e9d4ff3cf117837e5@syzkaller.appspotmail.com
Reported-by: syzbot+c27b8d5010f45c666ed1@syzkaller.appspotmail.com
Reported-by: syzbot+68a806795ac89df3aa1c@syzkaller.appspotmail.com
Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7")
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h |  11 +-
 net/netfilter/ipset/ip_set_core.c      |  34 +-
 net/netfilter/ipset/ip_set_hash_gen.h  | 629 +++++++++++++++++--------
 3 files changed, 470 insertions(+), 204 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 908d38dbcb91..5448c8b443db 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -121,6 +121,7 @@ struct ip_set_ext {
 	u32 timeout;
 	u8 packets_op;
 	u8 bytes_op;
+	bool target;
 };
 
 struct ip_set;
@@ -187,6 +188,14 @@ struct ip_set_type_variant {
 	/* Return true if "b" set is the same as "a"
 	 * according to the create set parameters */
 	bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+	/* Region-locking is used */
+	bool region_lock;
+};
+
+struct ip_set_region {
+	spinlock_t lock;	/* Region lock */
+	size_t ext_size;	/* Size of the dynamic extensions */
+	u32 elements;		/* Number of elements vs timeout */
 };
 
 /* The core set type structure */
@@ -501,7 +510,7 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 }
 
 #define IP_SET_INIT_KEXT(skb, opt, set)			\
-	{ .bytes = (skb)->len, .packets = 1,		\
+	{ .bytes = (skb)->len, .packets = 1, .target = true,\
 	  .timeout = ip_set_adt_opt_timeout(opt, set) }
 
 #define IP_SET_INIT_UEXT(set)				\
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 69c107f9ba8d..8dd17589217d 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -723,6 +723,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index)
 	return set;
 }
 
+static inline void
+ip_set_lock(struct ip_set *set)
+{
+	if (!set->variant->region_lock)
+		spin_lock_bh(&set->lock);
+}
+
+static inline void
+ip_set_unlock(struct ip_set *set)
+{
+	if (!set->variant->region_lock)
+		spin_unlock_bh(&set->lock);
+}
+
 int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)
@@ -744,9 +758,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	if (ret == -EAGAIN) {
 		/* Type requests element to be completed */
 		pr_debug("element must be completed, ADD is triggered\n");
-		spin_lock_bh(&set->lock);
+		ip_set_lock(set);
 		set->variant->kadt(set, skb, par, IPSET_ADD, opt);
-		spin_unlock_bh(&set->lock);
+		ip_set_unlock(set);
 		ret = 1;
 	} else {
 		/* --return-nomatch: invert matched element */
@@ -775,9 +789,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 
 	return ret;
 }
@@ -797,9 +811,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	    !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 
 	return ret;
 }
@@ -1264,9 +1278,9 @@ ip_set_flush_set(struct ip_set *set)
 {
 	pr_debug("set: %s\n",  set->name);
 
-	spin_lock_bh(&set->lock);
+	ip_set_lock(set);
 	set->variant->flush(set);
-	spin_unlock_bh(&set->lock);
+	ip_set_unlock(set);
 }
 
 static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
@@ -1713,9 +1727,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
 	bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
 
 	do {
-		spin_lock_bh(&set->lock);
+		ip_set_lock(set);
 		ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
-		spin_unlock_bh(&set->lock);
+		ip_set_unlock(set);
 		retried = true;
 	} while (ret == -EAGAIN &&
 		 set->variant->resize &&
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7480ce55b5c8..71e93eac0831 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -7,13 +7,21 @@
 #include <linux/rcupdate.h>
 #include <linux/jhash.h>
 #include <linux/types.h>
+#include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/ipset/ip_set.h>
 
-#define __ipset_dereference_protected(p, c)	rcu_dereference_protected(p, c)
-#define ipset_dereference_protected(p, set) \
-	__ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
-
-#define rcu_dereference_bh_nfnl(p)	rcu_dereference_bh_check(p, 1)
+#define __ipset_dereference(p)		\
+	rcu_dereference_protected(p, 1)
+#define ipset_dereference_nfnl(p)	\
+	rcu_dereference_protected(p,	\
+		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
+#define ipset_dereference_set(p, set) 	\
+	rcu_dereference_protected(p,	\
+		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
+		lockdep_is_held(&(set)->lock))
+#define ipset_dereference_bh_nfnl(p)	\
+	rcu_dereference_bh_check(p, 	\
+		lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
 
 /* Hashing which uses arrays to resolve clashing. The hash table is resized
  * (doubled) when searching becomes too long.
@@ -72,11 +80,35 @@ struct hbucket {
 		__aligned(__alignof__(u64));
 };
 
+/* Region size for locking == 2^HTABLE_REGION_BITS */
+#define HTABLE_REGION_BITS	10
+#define ahash_numof_locks(htable_bits)		\
+	((htable_bits) < HTABLE_REGION_BITS ? 1	\
+		: jhash_size((htable_bits) - HTABLE_REGION_BITS))
+#define ahash_sizeof_regions(htable_bits)		\
+	(ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
+#define ahash_region(n, htable_bits)		\
+	((n) % ahash_numof_locks(htable_bits))
+#define ahash_bucket_start(h,  htable_bits)	\
+	((htable_bits) < HTABLE_REGION_BITS ? 0	\
+		: (h) * jhash_size(HTABLE_REGION_BITS))
+#define ahash_bucket_end(h,  htable_bits)	\
+	((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits)	\
+		: ((h) + 1) * jhash_size(HTABLE_REGION_BITS))
+
+struct htable_gc {
+	struct delayed_work dwork;
+	struct ip_set *set;	/* Set the gc belongs to */
+	u32 region;		/* Last gc run position */
+};
+
 /* The hash table: the table size stored here in order to make resizing easy */
 struct htable {
 	atomic_t ref;		/* References for resizing */
-	atomic_t uref;		/* References for dumping */
+	atomic_t uref;		/* References for dumping and gc */
 	u8 htable_bits;		/* size of hash table == 2^htable_bits */
+	u32 maxelem;		/* Maxelem per region */
+	struct ip_set_region *hregion;	/* Region locks and ext sizes */
 	struct hbucket __rcu *bucket[0]; /* hashtable buckets */
 };
 
@@ -162,6 +194,10 @@ htable_bits(u32 hashsize)
 #define NLEN			0
 #endif /* IP_SET_HASH_WITH_NETS */
 
+#define SET_ELEM_EXPIRED(set, d)	\
+	(SET_WITH_TIMEOUT(set) &&	\
+	 ip_set_timeout_expired(ext_timeout(d, set)))
+
 #endif /* _IP_SET_HASH_GEN_H */
 
 #ifndef MTYPE
@@ -205,10 +241,12 @@ htable_bits(u32 hashsize)
 #undef mtype_test_cidrs
 #undef mtype_test
 #undef mtype_uref
-#undef mtype_expire
 #undef mtype_resize
+#undef mtype_ext_size
+#undef mtype_resize_ad
 #undef mtype_head
 #undef mtype_list
+#undef mtype_gc_do
 #undef mtype_gc
 #undef mtype_gc_init
 #undef mtype_variant
@@ -247,10 +285,12 @@ htable_bits(u32 hashsize)
 #define mtype_test_cidrs	IPSET_TOKEN(MTYPE, _test_cidrs)
 #define mtype_test		IPSET_TOKEN(MTYPE, _test)
 #define mtype_uref		IPSET_TOKEN(MTYPE, _uref)
-#define mtype_expire		IPSET_TOKEN(MTYPE, _expire)
 #define mtype_resize		IPSET_TOKEN(MTYPE, _resize)
+#define mtype_ext_size		IPSET_TOKEN(MTYPE, _ext_size)
+#define mtype_resize_ad		IPSET_TOKEN(MTYPE, _resize_ad)
 #define mtype_head		IPSET_TOKEN(MTYPE, _head)
 #define mtype_list		IPSET_TOKEN(MTYPE, _list)
+#define mtype_gc_do		IPSET_TOKEN(MTYPE, _gc_do)
 #define mtype_gc		IPSET_TOKEN(MTYPE, _gc)
 #define mtype_gc_init		IPSET_TOKEN(MTYPE, _gc_init)
 #define mtype_variant		IPSET_TOKEN(MTYPE, _variant)
@@ -275,8 +315,7 @@ htable_bits(u32 hashsize)
 /* The generic hash structure */
 struct htype {
 	struct htable __rcu *table; /* the hash table */
-	struct timer_list gc;	/* garbage collection when timeout enabled */
-	struct ip_set *set;	/* attached to this ip_set */
+	struct htable_gc gc;	/* gc workqueue */
 	u32 maxelem;		/* max elements in the hash */
 	u32 initval;		/* random jhash init value */
 #ifdef IP_SET_HASH_WITH_MARKMASK
@@ -288,21 +327,33 @@ struct htype {
 #ifdef IP_SET_HASH_WITH_NETMASK
 	u8 netmask;		/* netmask value for subnets to store */
 #endif
+	struct list_head ad;	/* Resize add|del backlist */
 	struct mtype_elem next; /* temporary storage for uadd */
 #ifdef IP_SET_HASH_WITH_NETS
 	struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
 #endif
 };
 
+/* ADD|DEL entries saved during resize */
+struct mtype_resize_ad {
+	struct list_head list;
+	enum ipset_adt ad;	/* ADD|DEL element */
+	struct mtype_elem d;	/* Element value */
+	struct ip_set_ext ext;	/* Extensions for ADD */
+	struct ip_set_ext mext;	/* Target extensions for ADD */
+	u32 flags;		/* Flags for ADD */
+};
+
 #ifdef IP_SET_HASH_WITH_NETS
 /* Network cidr size book keeping when the hash stores different
  * sized networks. cidr == real cidr + 1 to support /0.
  */
 static void
-mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
 {
 	int i, j;
 
+	spin_lock_bh(&set->lock);
 	/* Add in increasing prefix order, so larger cidr first */
 	for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
 		if (j != -1) {
@@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
 			j = i;
 		} else if (h->nets[i].cidr[n] == cidr) {
 			h->nets[CIDR_POS(cidr)].nets[n]++;
-			return;
+			goto unlock;
 		}
 	}
 	if (j != -1) {
@@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
 	}
 	h->nets[i].cidr[n] = cidr;
 	h->nets[CIDR_POS(cidr)].nets[n] = 1;
+unlock:
+	spin_unlock_bh(&set->lock);
 }
 
 static void
-mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
+mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
 {
 	u8 i, j, net_end = NLEN - 1;
 
+	spin_lock_bh(&set->lock);
 	for (i = 0; i < NLEN; i++) {
 		if (h->nets[i].cidr[n] != cidr)
 			continue;
 		h->nets[CIDR_POS(cidr)].nets[n]--;
 		if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
-			return;
+			goto unlock;
 		for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
 			h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
 		h->nets[j].cidr[n] = 0;
-		return;
+		goto unlock;
 	}
+unlock:
+	spin_unlock_bh(&set->lock);
 }
 #endif
 
@@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
 static size_t
 mtype_ahash_memsize(const struct htype *h, const struct htable *t)
 {
-	return sizeof(*h) + sizeof(*t);
+	return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits);
 }
 
 /* Get the ith element from the array block n */
@@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set)
 	struct htype *h = set->data;
 	struct htable *t;
 	struct hbucket *n;
-	u32 i;
+	u32 r, i;
 
-	t = ipset_dereference_protected(h->table, set);
-	for (i = 0; i < jhash_size(t->htable_bits); i++) {
-		n = __ipset_dereference_protected(hbucket(t, i), 1);
-		if (!n)
-			continue;
-		if (set->extensions & IPSET_EXT_DESTROY)
-			mtype_ext_cleanup(set, n);
-		/* FIXME: use slab cache */
-		rcu_assign_pointer(hbucket(t, i), NULL);
-		kfree_rcu(n, rcu);
+	t = ipset_dereference_nfnl(h->table);
+	for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+		spin_lock_bh(&t->hregion[r].lock);
+		for (i = ahash_bucket_start(r, t->htable_bits);
+		     i < ahash_bucket_end(r, t->htable_bits); i++) {
+			n = __ipset_dereference(hbucket(t, i));
+			if (!n)
+				continue;
+			if (set->extensions & IPSET_EXT_DESTROY)
+				mtype_ext_cleanup(set, n);
+			/* FIXME: use slab cache */
+			rcu_assign_pointer(hbucket(t, i), NULL);
+			kfree_rcu(n, rcu);
+		}
+		t->hregion[r].ext_size = 0;
+		t->hregion[r].elements = 0;
+		spin_unlock_bh(&t->hregion[r].lock);
 	}
 #ifdef IP_SET_HASH_WITH_NETS
 	memset(h->nets, 0, sizeof(h->nets));
 #endif
-	set->elements = 0;
-	set->ext_size = 0;
 }
 
 /* Destroy the hashtable part of the set */
@@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
 	u32 i;
 
 	for (i = 0; i < jhash_size(t->htable_bits); i++) {
-		n = __ipset_dereference_protected(hbucket(t, i), 1);
+		n = __ipset_dereference(hbucket(t, i));
 		if (!n)
 			continue;
 		if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
@@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy)
 		kfree(n);
 	}
 
+	ip_set_free(t->hregion);
 	ip_set_free(t);
 }
 
@@ -414,28 +476,21 @@ static void
 mtype_destroy(struct ip_set *set)
 {
 	struct htype *h = set->data;
+	struct list_head *l, *lt;
 
 	if (SET_WITH_TIMEOUT(set))
-		del_timer_sync(&h->gc);
+		cancel_delayed_work_sync(&h->gc.dwork);
 
-	mtype_ahash_destroy(set,
-			    __ipset_dereference_protected(h->table, 1), true);
+	mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
+	list_for_each_safe(l, lt, &h->ad) {
+		list_del(l);
+		kfree(l);
+	}
 	kfree(h);
 
 	set->data = NULL;
 }
 
-static void
-mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t))
-{
-	struct htype *h = set->data;
-
-	timer_setup(&h->gc, gc, 0);
-	mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
-	pr_debug("gc initialized, run in every %u\n",
-		 IPSET_GC_PERIOD(set->timeout));
-}
-
 static bool
 mtype_same_set(const struct ip_set *a, const struct ip_set *b)
 {
@@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
 	       a->extensions == b->extensions;
 }
 
-/* Delete expired elements from the hashtable */
 static void
-mtype_expire(struct ip_set *set, struct htype *h)
+mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
 {
-	struct htable *t;
 	struct hbucket *n, *tmp;
 	struct mtype_elem *data;
 	u32 i, j, d;
@@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct htype *h)
 #ifdef IP_SET_HASH_WITH_NETS
 	u8 k;
 #endif
+	u8 htable_bits = t->htable_bits;
 
-	t = ipset_dereference_protected(h->table, set);
-	for (i = 0; i < jhash_size(t->htable_bits); i++) {
-		n = __ipset_dereference_protected(hbucket(t, i), 1);
+	spin_lock_bh(&t->hregion[r].lock);
+	for (i = ahash_bucket_start(r, htable_bits);
+	     i < ahash_bucket_end(r, htable_bits); i++) {
+		n = __ipset_dereference(hbucket(t, i));
 		if (!n)
 			continue;
 		for (j = 0, d = 0; j < n->pos; j++) {
@@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct htype *h)
 			smp_mb__after_atomic();
 #ifdef IP_SET_HASH_WITH_NETS
 			for (k = 0; k < IPSET_NET_COUNT; k++)
-				mtype_del_cidr(h,
+				mtype_del_cidr(set, h,
 					NCIDR_PUT(DCIDR_GET(data->cidr, k)),
 					k);
 #endif
+			t->hregion[r].elements--;
 			ip_set_ext_destroy(set, data);
-			set->elements--;
 			d++;
 		}
 		if (d >= AHASH_INIT_SIZE) {
 			if (d >= n->size) {
+				t->hregion[r].ext_size -=
+					ext_size(n->size, dsize);
 				rcu_assign_pointer(hbucket(t, i), NULL);
 				kfree_rcu(n, rcu);
 				continue;
 			}
 			tmp = kzalloc(sizeof(*tmp) +
-				      (n->size - AHASH_INIT_SIZE) * dsize,
-				      GFP_ATOMIC);
+				(n->size - AHASH_INIT_SIZE) * dsize,
+				GFP_ATOMIC);
 			if (!tmp)
-				/* Still try to delete expired elements */
+				/* Still try to delete expired elements. */
 				continue;
 			tmp->size = n->size - AHASH_INIT_SIZE;
 			for (j = 0, d = 0; j < n->pos; j++) {
 				if (!test_bit(j, n->used))
 					continue;
 				data = ahash_data(n, j, dsize);
-				memcpy(tmp->value + d * dsize, data, dsize);
+				memcpy(tmp->value + d * dsize,
+				       data, dsize);
 				set_bit(d, tmp->used);
 				d++;
 			}
 			tmp->pos = d;
-			set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+			t->hregion[r].ext_size -=
+				ext_size(AHASH_INIT_SIZE, dsize);
 			rcu_assign_pointer(hbucket(t, i), tmp);
 			kfree_rcu(n, rcu);
 		}
 	}
+	spin_unlock_bh(&t->hregion[r].lock);
 }
 
 static void
-mtype_gc(struct timer_list *t)
+mtype_gc(struct work_struct *work)
 {
-	struct htype *h = from_timer(h, t, gc);
-	struct ip_set *set = h->set;
+	struct htable_gc *gc;
+	struct ip_set *set;
+	struct htype *h;
+	struct htable *t;
+	u32 r, numof_locks;
+	unsigned int next_run;
+
+	gc = container_of(work, struct htable_gc, dwork.work);
+	set = gc->set;
+	h = set->data;
 
-	pr_debug("called\n");
 	spin_lock_bh(&set->lock);
-	mtype_expire(set, h);
+	t = ipset_dereference_set(h->table, set);
+	atomic_inc(&t->uref);
+	numof_locks = ahash_numof_locks(t->htable_bits);
+	r = gc->region++;
+	if (r >= numof_locks) {
+		r = gc->region = 0;
+	}
+	next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks;
+	if (next_run < HZ/10)
+		next_run = HZ/10;
 	spin_unlock_bh(&set->lock);
 
-	h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
-	add_timer(&h->gc);
+	mtype_gc_do(set, h, t, r);
+
+	if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+		pr_debug("Table destroy after resize by expire: %p\n", t);
+		mtype_ahash_destroy(set, t, false);
+	}
+
+	queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run);
+
 }
 
+static void
+mtype_gc_init(struct htable_gc *gc)
+{
+	INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc);
+	queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
+}
+
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	  struct ip_set_ext *mext, u32 flags);
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	  struct ip_set_ext *mext, u32 flags);
+
 /* Resize a hash: create a new hash table with doubling the hashsize
  * and inserting the elements to it. Repeat until we succeed or
  * fail due to memory pressures.
@@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool retried)
 	struct htype *h = set->data;
 	struct htable *t, *orig;
 	u8 htable_bits;
-	size_t extsize, dsize = set->dsize;
+	size_t dsize = set->dsize;
 #ifdef IP_SET_HASH_WITH_NETS
 	u8 flags;
 	struct mtype_elem *tmp;
@@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool retried)
 	struct mtype_elem *data;
 	struct mtype_elem *d;
 	struct hbucket *n, *m;
-	u32 i, j, key;
+	struct list_head *l, *lt;
+	struct mtype_resize_ad *x;
+	u32 i, j, r, nr, key;
 	int ret;
 
 #ifdef IP_SET_HASH_WITH_NETS
@@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool retried)
 	if (!tmp)
 		return -ENOMEM;
 #endif
-	rcu_read_lock_bh();
-	orig = rcu_dereference_bh_nfnl(h->table);
+	orig = ipset_dereference_bh_nfnl(h->table);
 	htable_bits = orig->htable_bits;
-	rcu_read_unlock_bh();
 
 retry:
 	ret = 0;
@@ -583,88 +680,124 @@ mtype_resize(struct ip_set *set, bool retried)
 		ret = -ENOMEM;
 		goto out;
 	}
+	t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
+	if (!t->hregion) {
+		kfree(t);
+		ret = -ENOMEM;
+		goto out;
+	}
 	t->htable_bits = htable_bits;
+	t->maxelem = h->maxelem / ahash_numof_locks(htable_bits);
+	for (i = 0; i < ahash_numof_locks(htable_bits); i++)
+		spin_lock_init(&t->hregion[i].lock);
 
-	spin_lock_bh(&set->lock);
-	orig = __ipset_dereference_protected(h->table, 1);
-	/* There can't be another parallel resizing, but dumping is possible */
+	/* There can't be another parallel resizing,
+	 * but dumping, gc, kernel side add/del are possible
+	 */
+	orig = ipset_dereference_bh_nfnl(h->table);
 	atomic_set(&orig->ref, 1);
 	atomic_inc(&orig->uref);
-	extsize = 0;
 	pr_debug("attempt to resize set %s from %u to %u, t %p\n",
 		 set->name, orig->htable_bits, htable_bits, orig);
-	for (i = 0; i < jhash_size(orig->htable_bits); i++) {
-		n = __ipset_dereference_protected(hbucket(orig, i), 1);
-		if (!n)
-			continue;
-		for (j = 0; j < n->pos; j++) {
-			if (!test_bit(j, n->used))
+	for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) {
+		/* Expire may replace a hbucket with another one */
+		rcu_read_lock_bh();
+		for (i = ahash_bucket_start(r, orig->htable_bits);
+		     i < ahash_bucket_end(r, orig->htable_bits); i++) {
+			n = __ipset_dereference(hbucket(orig, i));
+			if (!n)
 				continue;
-			data = ahash_data(n, j, dsize);
+			for (j = 0; j < n->pos; j++) {
+				if (!test_bit(j, n->used))
+					continue;
+				data = ahash_data(n, j, dsize);
+				if (SET_ELEM_EXPIRED(set, data))
+					continue;
 #ifdef IP_SET_HASH_WITH_NETS
-			/* We have readers running parallel with us,
-			 * so the live data cannot be modified.
-			 */
-			flags = 0;
-			memcpy(tmp, data, dsize);
-			data = tmp;
-			mtype_data_reset_flags(data, &flags);
+				/* We have readers running parallel with us,
+				 * so the live data cannot be modified.
+				 */
+				flags = 0;
+				memcpy(tmp, data, dsize);
+				data = tmp;
+				mtype_data_reset_flags(data, &flags);
 #endif
-			key = HKEY(data, h->initval, htable_bits);
-			m = __ipset_dereference_protected(hbucket(t, key), 1);
-			if (!m) {
-				m = kzalloc(sizeof(*m) +
+				key = HKEY(data, h->initval, htable_bits);
+				m = __ipset_dereference(hbucket(t, key));
+				nr = ahash_region(key, htable_bits);
+				if (!m) {
+					m = kzalloc(sizeof(*m) +
 					    AHASH_INIT_SIZE * dsize,
 					    GFP_ATOMIC);
-				if (!m) {
-					ret = -ENOMEM;
-					goto cleanup;
-				}
-				m->size = AHASH_INIT_SIZE;
-				extsize += ext_size(AHASH_INIT_SIZE, dsize);
-				RCU_INIT_POINTER(hbucket(t, key), m);
-			} else if (m->pos >= m->size) {
-				struct hbucket *ht;
+					if (!m) {
+						ret = -ENOMEM;
+						goto cleanup;
+					}
+					m->size = AHASH_INIT_SIZE;
+					t->hregion[nr].ext_size +=
+						ext_size(AHASH_INIT_SIZE,
+							 dsize);
+					RCU_INIT_POINTER(hbucket(t, key), m);
+				} else if (m->pos >= m->size) {
+					struct hbucket *ht;
 
-				if (m->size >= AHASH_MAX(h)) {
-					ret = -EAGAIN;
-				} else {
-					ht = kzalloc(sizeof(*ht) +
+					if (m->size >= AHASH_MAX(h)) {
+						ret = -EAGAIN;
+					} else {
+						ht = kzalloc(sizeof(*ht) +
 						(m->size + AHASH_INIT_SIZE)
 						* dsize,
 						GFP_ATOMIC);
-					if (!ht)
-						ret = -ENOMEM;
+						if (!ht)
+							ret = -ENOMEM;
+					}
+					if (ret < 0)
+						goto cleanup;
+					memcpy(ht, m, sizeof(struct hbucket) +
+					       m->size * dsize);
+					ht->size = m->size + AHASH_INIT_SIZE;
+					t->hregion[nr].ext_size +=
+						ext_size(AHASH_INIT_SIZE,
+							 dsize);
+					kfree(m);
+					m = ht;
+					RCU_INIT_POINTER(hbucket(t, key), ht);
 				}
-				if (ret < 0)
-					goto cleanup;
-				memcpy(ht, m, sizeof(struct hbucket) +
-					      m->size * dsize);
-				ht->size = m->size + AHASH_INIT_SIZE;
-				extsize += ext_size(AHASH_INIT_SIZE, dsize);
-				kfree(m);
-				m = ht;
-				RCU_INIT_POINTER(hbucket(t, key), ht);
-			}
-			d = ahash_data(m, m->pos, dsize);
-			memcpy(d, data, dsize);
-			set_bit(m->pos++, m->used);
+				d = ahash_data(m, m->pos, dsize);
+				memcpy(d, data, dsize);
+				set_bit(m->pos++, m->used);
+				t->hregion[nr].elements++;
 #ifdef IP_SET_HASH_WITH_NETS
-			mtype_data_reset_flags(d, &flags);
+				mtype_data_reset_flags(d, &flags);
 #endif
+			}
 		}
+		rcu_read_unlock_bh();
 	}
-	rcu_assign_pointer(h->table, t);
-	set->ext_size = extsize;
 
-	spin_unlock_bh(&set->lock);
+	/* There can't be any other writer. */
+	rcu_assign_pointer(h->table, t);
 
 	/* Give time to other readers of the set */
 	synchronize_rcu();
 
 	pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
 		 orig->htable_bits, orig, t->htable_bits, t);
-	/* If there's nobody else dumping the table, destroy it */
+	/* Add/delete elements processed by the SET target during resize.
+	 * Kernel-side add cannot trigger a resize and userspace actions
+	 * are serialized by the mutex.
+	 */
+	list_for_each_safe(l, lt, &h->ad) {
+		x = list_entry(l, struct mtype_resize_ad, list);
+		if (x->ad == IPSET_ADD) {
+			mtype_add(set, &x->d, &x->ext, &x->mext, x->flags);
+		} else {
+			mtype_del(set, &x->d, NULL, NULL, 0);
+		}
+		list_del(l);
+		kfree(l);
+	}
+	/* If there's nobody else using the table, destroy it */
 	if (atomic_dec_and_test(&orig->uref)) {
 		pr_debug("Table destroy by resize %p\n", orig);
 		mtype_ahash_destroy(set, orig, false);
@@ -677,15 +810,44 @@ mtype_resize(struct ip_set *set, bool retried)
 	return ret;
 
 cleanup:
+	rcu_read_unlock_bh();
 	atomic_set(&orig->ref, 0);
 	atomic_dec(&orig->uref);
-	spin_unlock_bh(&set->lock);
 	mtype_ahash_destroy(set, t, false);
 	if (ret == -EAGAIN)
 		goto retry;
 	goto out;
 }
 
+/* Get the current number of elements and ext_size in the set  */
+static void
+mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
+{
+	struct htype *h = set->data;
+	const struct htable *t;
+	u32 i, j, r;
+	struct hbucket *n;
+	struct mtype_elem *data;
+
+	t = rcu_dereference_bh(h->table);
+	for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
+		for (i = ahash_bucket_start(r, t->htable_bits);
+		     i < ahash_bucket_end(r, t->htable_bits); i++) {
+			n = rcu_dereference_bh(hbucket(t, i));
+			if (!n)
+				continue;
+			for (j = 0; j < n->pos; j++) {
+				if (!test_bit(j, n->used))
+					continue;
+				data = ahash_data(n, j, set->dsize);
+				if (!SET_ELEM_EXPIRED(set, data))
+					(*elements)++;
+			}
+		}
+		*ext_size += t->hregion[r].ext_size;
+	}
+}
+
 /* Add an element to a hash and update the internal counters when succeeded,
  * otherwise report the proper error code.
  */
@@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	const struct mtype_elem *d = value;
 	struct mtype_elem *data;
 	struct hbucket *n, *old = ERR_PTR(-ENOENT);
-	int i, j = -1;
+	int i, j = -1, ret;
 	bool flag_exist = flags & IPSET_FLAG_EXIST;
 	bool deleted = false, forceadd = false, reuse = false;
-	u32 key, multi = 0;
+	u32 r, key, multi = 0, elements, maxelem;
 
-	if (set->elements >= h->maxelem) {
-		if (SET_WITH_TIMEOUT(set))
-			/* FIXME: when set is full, we slow down here */
-			mtype_expire(set, h);
-		if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
+	rcu_read_lock_bh();
+	t = rcu_dereference_bh(h->table);
+	key = HKEY(value, h->initval, t->htable_bits);
+	r = ahash_region(key, t->htable_bits);
+	atomic_inc(&t->uref);
+	elements = t->hregion[r].elements;
+	maxelem = t->maxelem;
+	if (elements >= maxelem) {
+		u32 e;
+		if (SET_WITH_TIMEOUT(set)) {
+			rcu_read_unlock_bh();
+			mtype_gc_do(set, h, t, r);
+			rcu_read_lock_bh();
+		}
+		maxelem = h->maxelem;
+		elements = 0;
+		for (e = 0; e < ahash_numof_locks(t->htable_bits); e++)
+			elements += t->hregion[e].elements;
+		if (elements >= maxelem && SET_WITH_FORCEADD(set))
 			forceadd = true;
 	}
+	rcu_read_unlock_bh();
 
-	t = ipset_dereference_protected(h->table, set);
-	key = HKEY(value, h->initval, t->htable_bits);
-	n = __ipset_dereference_protected(hbucket(t, key), 1);
+	spin_lock_bh(&t->hregion[r].lock);
+	n = rcu_dereference_bh(hbucket(t, key));
 	if (!n) {
-		if (forceadd || set->elements >= h->maxelem)
+		if (forceadd || elements >= maxelem)
 			goto set_full;
 		old = NULL;
 		n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
 			    GFP_ATOMIC);
-		if (!n)
-			return -ENOMEM;
+		if (!n) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
 		n->size = AHASH_INIT_SIZE;
-		set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+		t->hregion[r].ext_size +=
+			ext_size(AHASH_INIT_SIZE, set->dsize);
 		goto copy_elem;
 	}
 	for (i = 0; i < n->pos; i++) {
@@ -737,19 +916,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		}
 		data = ahash_data(n, i, set->dsize);
 		if (mtype_data_equal(data, d, &multi)) {
-			if (flag_exist ||
-			    (SET_WITH_TIMEOUT(set) &&
-			     ip_set_timeout_expired(ext_timeout(data, set)))) {
+			if (flag_exist || SET_ELEM_EXPIRED(set, data)) {
 				/* Just the extensions could be overwritten */
 				j = i;
 				goto overwrite_extensions;
 			}
-			return -IPSET_ERR_EXIST;
+			ret = -IPSET_ERR_EXIST;
+			goto unlock;
 		}
 		/* Reuse first timed out entry */
-		if (SET_WITH_TIMEOUT(set) &&
-		    ip_set_timeout_expired(ext_timeout(data, set)) &&
-		    j == -1) {
+		if (SET_ELEM_EXPIRED(set, data) && j == -1) {
 			j = i;
 			reuse = true;
 		}
@@ -759,16 +935,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (!deleted) {
 #ifdef IP_SET_HASH_WITH_NETS
 			for (i = 0; i < IPSET_NET_COUNT; i++)
-				mtype_del_cidr(h,
+				mtype_del_cidr(set, h,
 					NCIDR_PUT(DCIDR_GET(data->cidr, i)),
 					i);
 #endif
 			ip_set_ext_destroy(set, data);
-			set->elements--;
+			t->hregion[r].elements--;
 		}
 		goto copy_data;
 	}
-	if (set->elements >= h->maxelem)
+	if (elements >= maxelem)
 		goto set_full;
 	/* Create a new slot */
 	if (n->pos >= n->size) {
@@ -776,28 +952,32 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (n->size >= AHASH_MAX(h)) {
 			/* Trigger rehashing */
 			mtype_data_next(&h->next, d);
-			return -EAGAIN;
+			ret = -EAGAIN;
+			goto resize;
 		}
 		old = n;
 		n = kzalloc(sizeof(*n) +
 			    (old->size + AHASH_INIT_SIZE) * set->dsize,
 			    GFP_ATOMIC);
-		if (!n)
-			return -ENOMEM;
+		if (!n) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
 		memcpy(n, old, sizeof(struct hbucket) +
 		       old->size * set->dsize);
 		n->size = old->size + AHASH_INIT_SIZE;
-		set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
+		t->hregion[r].ext_size +=
+			ext_size(AHASH_INIT_SIZE, set->dsize);
 	}
 
 copy_elem:
 	j = n->pos++;
 	data = ahash_data(n, j, set->dsize);
 copy_data:
-	set->elements++;
+	t->hregion[r].elements++;
 #ifdef IP_SET_HASH_WITH_NETS
 	for (i = 0; i < IPSET_NET_COUNT; i++)
-		mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
+		mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
 #endif
 	memcpy(data, d, sizeof(struct mtype_elem));
 overwrite_extensions:
@@ -820,13 +1000,41 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (old)
 			kfree_rcu(old, rcu);
 	}
+	ret = 0;
+resize:
+	spin_unlock_bh(&t->hregion[r].lock);
+	if (atomic_read(&t->ref) && ext->target) {
+		/* Resize is in process and kernel side add, save values */
+		struct mtype_resize_ad *x;
+
+		x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC);
+		if (!x)
+			/* Don't bother */
+			goto out;
+		x->ad = IPSET_ADD;
+		memcpy(&x->d, value, sizeof(struct mtype_elem));
+		memcpy(&x->ext, ext, sizeof(struct ip_set_ext));
+		memcpy(&x->mext, mext, sizeof(struct ip_set_ext));
+		x->flags = flags;
+		spin_lock_bh(&set->lock);
+		list_add_tail(&x->list, &h->ad);
+		spin_unlock_bh(&set->lock);
+	}
+	goto out;
 
-	return 0;
 set_full:
 	if (net_ratelimit())
 		pr_warn("Set %s is full, maxelem %u reached\n",
-			set->name, h->maxelem);
-	return -IPSET_ERR_HASH_FULL;
+			set->name, maxelem);
+	ret = -IPSET_ERR_HASH_FULL;
+unlock:
+	spin_unlock_bh(&t->hregion[r].lock);
+out:
+	if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+		pr_debug("Table destroy after resize by add: %p\n", t);
+		mtype_ahash_destroy(set, t, false);
+	}
+	return ret;
 }
 
 /* Delete an element from the hash and free up space if possible.
@@ -840,13 +1048,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	const struct mtype_elem *d = value;
 	struct mtype_elem *data;
 	struct hbucket *n;
-	int i, j, k, ret = -IPSET_ERR_EXIST;
+	struct mtype_resize_ad *x = NULL;
+	int i, j, k, r, ret = -IPSET_ERR_EXIST;
 	u32 key, multi = 0;
 	size_t dsize = set->dsize;
 
-	t = ipset_dereference_protected(h->table, set);
+	/* Userspace add and resize is excluded by the mutex.
+	 * Kernespace add does not trigger resize.
+	 */
+	rcu_read_lock_bh();
+	t = rcu_dereference_bh(h->table);
 	key = HKEY(value, h->initval, t->htable_bits);
-	n = __ipset_dereference_protected(hbucket(t, key), 1);
+	r = ahash_region(key, t->htable_bits);
+	atomic_inc(&t->uref);
+	rcu_read_unlock_bh();
+
+	spin_lock_bh(&t->hregion[r].lock);
+	n = rcu_dereference_bh(hbucket(t, key));
 	if (!n)
 		goto out;
 	for (i = 0, k = 0; i < n->pos; i++) {
@@ -857,8 +1075,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		data = ahash_data(n, i, dsize);
 		if (!mtype_data_equal(data, d, &multi))
 			continue;
-		if (SET_WITH_TIMEOUT(set) &&
-		    ip_set_timeout_expired(ext_timeout(data, set)))
+		if (SET_ELEM_EXPIRED(set, data))
 			goto out;
 
 		ret = 0;
@@ -866,20 +1083,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		smp_mb__after_atomic();
 		if (i + 1 == n->pos)
 			n->pos--;
-		set->elements--;
+		t->hregion[r].elements--;
 #ifdef IP_SET_HASH_WITH_NETS
 		for (j = 0; j < IPSET_NET_COUNT; j++)
-			mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
-				       j);
+			mtype_del_cidr(set, h,
+				       NCIDR_PUT(DCIDR_GET(d->cidr, j)), j);
 #endif
 		ip_set_ext_destroy(set, data);
 
+		if (atomic_read(&t->ref) && ext->target) {
+			/* Resize is in process and kernel side del,
+			 * save values
+			 */
+			x = kzalloc(sizeof(struct mtype_resize_ad),
+				    GFP_ATOMIC);
+			if (x) {
+				x->ad = IPSET_DEL;
+				memcpy(&x->d, value,
+				       sizeof(struct mtype_elem));
+				x->flags = flags;
+			}
+		}
 		for (; i < n->pos; i++) {
 			if (!test_bit(i, n->used))
 				k++;
 		}
 		if (n->pos == 0 && k == 0) {
-			set->ext_size -= ext_size(n->size, dsize);
+			t->hregion[r].ext_size -= ext_size(n->size, dsize);
 			rcu_assign_pointer(hbucket(t, key), NULL);
 			kfree_rcu(n, rcu);
 		} else if (k >= AHASH_INIT_SIZE) {
@@ -898,7 +1128,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 				k++;
 			}
 			tmp->pos = k;
-			set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
+			t->hregion[r].ext_size -=
+				ext_size(AHASH_INIT_SIZE, dsize);
 			rcu_assign_pointer(hbucket(t, key), tmp);
 			kfree_rcu(n, rcu);
 		}
@@ -906,6 +1137,16 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	}
 
 out:
+	spin_unlock_bh(&t->hregion[r].lock);
+	if (x) {
+		spin_lock_bh(&set->lock);
+		list_add(&x->list, &h->ad);
+		spin_unlock_bh(&set->lock);
+	}
+	if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+		pr_debug("Table destroy after resize by del: %p\n", t);
+		mtype_ahash_destroy(set, t, false);
+	}
 	return ret;
 }
 
@@ -991,6 +1232,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 	int i, ret = 0;
 	u32 key, multi = 0;
 
+	rcu_read_lock_bh();
 	t = rcu_dereference_bh(h->table);
 #ifdef IP_SET_HASH_WITH_NETS
 	/* If we test an IP address and not a network address,
@@ -1022,6 +1264,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 			goto out;
 	}
 out:
+	rcu_read_unlock_bh();
 	return ret;
 }
 
@@ -1033,23 +1276,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 	const struct htable *t;
 	struct nlattr *nested;
 	size_t memsize;
+	u32 elements = 0;
+	size_t ext_size = 0;
 	u8 htable_bits;
 
-	/* If any members have expired, set->elements will be wrong
-	 * mytype_expire function will update it with the right count.
-	 * we do not hold set->lock here, so grab it first.
-	 * set->elements can still be incorrect in the case of a huge set,
-	 * because elements might time out during the listing.
-	 */
-	if (SET_WITH_TIMEOUT(set)) {
-		spin_lock_bh(&set->lock);
-		mtype_expire(set, h);
-		spin_unlock_bh(&set->lock);
-	}
-
 	rcu_read_lock_bh();
-	t = rcu_dereference_bh_nfnl(h->table);
-	memsize = mtype_ahash_memsize(h, t) + set->ext_size;
+	t = rcu_dereference_bh(h->table);
+	mtype_ext_size(set, &elements, &ext_size);
+	memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size;
 	htable_bits = t->htable_bits;
 	rcu_read_unlock_bh();
 
@@ -1071,7 +1305,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
 #endif
 	if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
 	    nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
-	    nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
+	    nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
 		goto nla_put_failure;
 	if (unlikely(ip_set_put_flags(skb, set)))
 		goto nla_put_failure;
@@ -1091,15 +1325,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start)
 
 	if (start) {
 		rcu_read_lock_bh();
-		t = rcu_dereference_bh_nfnl(h->table);
+		t = ipset_dereference_bh_nfnl(h->table);
 		atomic_inc(&t->uref);
 		cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
 		rcu_read_unlock_bh();
 	} else if (cb->args[IPSET_CB_PRIVATE]) {
 		t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
 		if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
-			/* Resizing didn't destroy the hash table */
-			pr_debug("Table destroy by dump: %p\n", t);
+			pr_debug("Table destroy after resize "
+				 " by dump: %p\n", t);
 			mtype_ahash_destroy(set, t, false);
 		}
 		cb->args[IPSET_CB_PRIVATE] = 0;
@@ -1141,8 +1375,7 @@ mtype_list(const struct ip_set *set,
 			if (!test_bit(i, n->used))
 				continue;
 			e = ahash_data(n, i, set->dsize);
-			if (SET_WITH_TIMEOUT(set) &&
-			    ip_set_timeout_expired(ext_timeout(e, set)))
+			if (SET_ELEM_EXPIRED(set, e))
 				continue;
 			pr_debug("list hash %lu hbucket %p i %u, data %p\n",
 				 cb->args[IPSET_CB_ARG0], n, i, e);
@@ -1208,6 +1441,7 @@ static const struct ip_set_type_variant mtype_variant = {
 	.uref	= mtype_uref,
 	.resize	= mtype_resize,
 	.same_set = mtype_same_set,
+	.region_lock = true,
 };
 
 #ifdef IP_SET_EMIT_CREATE
@@ -1226,6 +1460,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 	size_t hsize;
 	struct htype *h;
 	struct htable *t;
+	u32 i;
 
 	pr_debug("Create set %s with family %s\n",
 		 set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
@@ -1294,6 +1529,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 		kfree(h);
 		return -ENOMEM;
 	}
+	t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
+	if (!t->hregion) {
+		kfree(t);
+		kfree(h);
+		return -ENOMEM;
+	}
+	h->gc.set = set;
+	for (i = 0; i < ahash_numof_locks(hbits); i++)
+		spin_lock_init(&t->hregion[i].lock);
 	h->maxelem = maxelem;
 #ifdef IP_SET_HASH_WITH_NETMASK
 	h->netmask = netmask;
@@ -1304,9 +1548,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 	get_random_bytes(&h->initval, sizeof(h->initval));
 
 	t->htable_bits = hbits;
+	t->maxelem = h->maxelem / ahash_numof_locks(hbits);
 	RCU_INIT_POINTER(h->table, t);
 
-	h->set = set;
+	INIT_LIST_HEAD(&h->ad);
 	set->data = h;
 #ifndef IP_SET_PROTO_UNDEF
 	if (set->family == NFPROTO_IPV4) {
@@ -1329,12 +1574,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
 #ifndef IP_SET_PROTO_UNDEF
 		if (set->family == NFPROTO_IPV4)
 #endif
-			IPSET_TOKEN(HTYPE, 4_gc_init)(set,
-				IPSET_TOKEN(HTYPE, 4_gc));
+			IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc);
 #ifndef IP_SET_PROTO_UNDEF
 		else
-			IPSET_TOKEN(HTYPE, 6_gc_init)(set,
-				IPSET_TOKEN(HTYPE, 6_gc));
+			IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc);
 #endif
 	}
 	pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",

From 8af1c6fbd9239877998c7f5a591cb2c88d41fb66 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@netfilter.org>
Date: Sat, 22 Feb 2020 12:01:43 +0100
Subject: [PATCH 156/239] netfilter: ipset: Fix forceadd evaluation path

When the forceadd option is enabled, the hash:* types should find and replace
the first entry in the bucket with the new one if there are no reuseable
(deleted or timed out) entries. However, the position index was just not set
to zero and remained the invalid -1 if there were no reuseable entries.

Reported-by: syzbot+6a86565c74ebe30aea18@syzkaller.appspotmail.com
Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7")
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
---
 net/netfilter/ipset/ip_set_hash_gen.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 71e93eac0831..e52d7b7597a0 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -931,6 +931,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		}
 	}
 	if (reuse || forceadd) {
+		if (j == -1)
+			j = 0;
 		data = ahash_data(n, j, set->dsize);
 		if (!deleted) {
 #ifdef IP_SET_HASH_WITH_NETS

From c7849be9cc2dd2754c48ddbaca27c2de6d80a95d Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Date: Sat, 22 Feb 2020 14:46:05 +0800
Subject: [PATCH 157/239] io_uring: fix __io_iopoll_check deadlock in
 io_sq_thread

Since commit a3a0e43fd770 ("io_uring: don't enter poll loop if we have
CQEs pending"), if we already events pending, we won't enter poll loop.
In case SETUP_IOPOLL and SETUP_SQPOLL are both enabled, if app has
been terminated and don't reap pending events which are already in cq
ring, and there are some reqs in poll_list, io_sq_thread will enter
__io_iopoll_check(), and find pending events, then return, this loop
will never have a chance to exit.

I have seen this issue in fio stress tests, to fix this issue, let
io_sq_thread call io_iopoll_getevents() with argument 'min' being zero,
and remove __io_iopoll_check().

Fixes: a3a0e43fd770 ("io_uring: don't enter poll loop if we have CQEs pending")
Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b43467b3a8dc..de650df9ac53 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1672,11 +1672,17 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
-static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
-			    long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+			   long min)
 {
 	int iters = 0, ret = 0;
 
+	/*
+	 * We disallow the app entering submit/complete with polling, but we
+	 * still need to lock the ring to prevent racing with polled issue
+	 * that got punted to a workqueue.
+	 */
+	mutex_lock(&ctx->uring_lock);
 	do {
 		int tmin = 0;
 
@@ -1712,21 +1718,6 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		ret = 0;
 	} while (min && !*nr_events && !need_resched());
 
-	return ret;
-}
-
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
-			   long min)
-{
-	int ret;
-
-	/*
-	 * We disallow the app entering submit/complete with polling, but we
-	 * still need to lock the ring to prevent racing with polled issue
-	 * that got punted to a workqueue.
-	 */
-	mutex_lock(&ctx->uring_lock);
-	ret = __io_iopoll_check(ctx, nr_events, min);
 	mutex_unlock(&ctx->uring_lock);
 	return ret;
 }
@@ -5118,7 +5109,7 @@ static int io_sq_thread(void *data)
 				 */
 				mutex_lock(&ctx->uring_lock);
 				if (!list_empty(&ctx->poll_list))
-					__io_iopoll_check(ctx, &nr_events, 0);
+					io_iopoll_getevents(ctx, &nr_events, 0);
 				else
 					inflight = 0;
 				mutex_unlock(&ctx->uring_lock);

From 2ad3e17ebf94b7b7f3f64c050ff168f9915345eb Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Sat, 22 Feb 2020 20:36:47 -0500
Subject: [PATCH 158/239] audit: fix error handling in audit_data_to_entry()

Commit 219ca39427bf ("audit: use union for audit_field values since
they are mutually exclusive") combined a number of separate fields in
the audit_field struct into a single union.  Generally this worked
just fine because they are generally mutually exclusive.
Unfortunately in audit_data_to_entry() the overlap can be a problem
when a specific error case is triggered that causes the error path
code to attempt to cleanup an audit_field struct and the cleanup
involves attempting to free a stored LSM string (the lsm_str field).
Currently the code always has a non-NULL value in the
audit_field.lsm_str field as the top of the for-loop transfers a
value into audit_field.val (both .lsm_str and .val are part of the
same union); if audit_data_to_entry() fails and the audit_field
struct is specified to contain a LSM string, but the
audit_field.lsm_str has not yet been properly set, the error handling
code will attempt to free the bogus audit_field.lsm_str value that
was set with audit_field.val at the top of the for-loop.

This patch corrects this by ensuring that the audit_field.val is only
set when needed (it is cleared when the audit_field struct is
allocated with kcalloc()).  It also corrects a few other issues to
ensure that in case of error the proper error code is returned.

Cc: stable@vger.kernel.org
Fixes: 219ca39427bf ("audit: use union for audit_field values since they are mutually exclusive")
Reported-by: syzbot+1f4d90ead370d72e450b@syzkaller.appspotmail.com
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditfilter.c | 81 ++++++++++++++++++++++++--------------------
 1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b0126e9c0743..026e34da4ace 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -456,6 +456,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 	bufp = data->buf;
 	for (i = 0; i < data->field_count; i++) {
 		struct audit_field *f = &entry->rule.fields[i];
+		u32 f_val;
 
 		err = -EINVAL;
 
@@ -464,12 +465,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			goto exit_free;
 
 		f->type = data->fields[i];
-		f->val = data->values[i];
+		f_val = data->values[i];
 
 		/* Support legacy tests for a valid loginuid */
-		if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
+		if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
 			f->type = AUDIT_LOGINUID_SET;
-			f->val = 0;
+			f_val = 0;
 			entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
 		}
 
@@ -485,7 +486,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		case AUDIT_SUID:
 		case AUDIT_FSUID:
 		case AUDIT_OBJ_UID:
-			f->uid = make_kuid(current_user_ns(), f->val);
+			f->uid = make_kuid(current_user_ns(), f_val);
 			if (!uid_valid(f->uid))
 				goto exit_free;
 			break;
@@ -494,11 +495,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		case AUDIT_SGID:
 		case AUDIT_FSGID:
 		case AUDIT_OBJ_GID:
-			f->gid = make_kgid(current_user_ns(), f->val);
+			f->gid = make_kgid(current_user_ns(), f_val);
 			if (!gid_valid(f->gid))
 				goto exit_free;
 			break;
 		case AUDIT_ARCH:
+			f->val = f_val;
 			entry->rule.arch_f = f;
 			break;
 		case AUDIT_SUBJ_USER:
@@ -511,11 +513,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		case AUDIT_OBJ_TYPE:
 		case AUDIT_OBJ_LEV_LOW:
 		case AUDIT_OBJ_LEV_HIGH:
-			str = audit_unpack_string(&bufp, &remain, f->val);
-			if (IS_ERR(str))
+			str = audit_unpack_string(&bufp, &remain, f_val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
 				goto exit_free;
-			entry->rule.buflen += f->val;
-
+			}
+			entry->rule.buflen += f_val;
+			f->lsm_str = str;
 			err = security_audit_rule_init(f->type, f->op, str,
 						       (void **)&f->lsm_rule);
 			/* Keep currently invalid fields around in case they
@@ -524,68 +528,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 				pr_warn("audit rule for LSM \'%s\' is invalid\n",
 					str);
 				err = 0;
-			}
-			if (err) {
-				kfree(str);
+			} else if (err)
 				goto exit_free;
-			} else
-				f->lsm_str = str;
 			break;
 		case AUDIT_WATCH:
-			str = audit_unpack_string(&bufp, &remain, f->val);
-			if (IS_ERR(str))
+			str = audit_unpack_string(&bufp, &remain, f_val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
 				goto exit_free;
-			entry->rule.buflen += f->val;
-
-			err = audit_to_watch(&entry->rule, str, f->val, f->op);
+			}
+			err = audit_to_watch(&entry->rule, str, f_val, f->op);
 			if (err) {
 				kfree(str);
 				goto exit_free;
 			}
+			entry->rule.buflen += f_val;
 			break;
 		case AUDIT_DIR:
-			str = audit_unpack_string(&bufp, &remain, f->val);
-			if (IS_ERR(str))
+			str = audit_unpack_string(&bufp, &remain, f_val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
 				goto exit_free;
-			entry->rule.buflen += f->val;
-
+			}
 			err = audit_make_tree(&entry->rule, str, f->op);
 			kfree(str);
 			if (err)
 				goto exit_free;
+			entry->rule.buflen += f_val;
 			break;
 		case AUDIT_INODE:
+			f->val = f_val;
 			err = audit_to_inode(&entry->rule, f);
 			if (err)
 				goto exit_free;
 			break;
 		case AUDIT_FILTERKEY:
-			if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+			if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
 				goto exit_free;
-			str = audit_unpack_string(&bufp, &remain, f->val);
-			if (IS_ERR(str))
-				goto exit_free;
-			entry->rule.buflen += f->val;
-			entry->rule.filterkey = str;
-			break;
-		case AUDIT_EXE:
-			if (entry->rule.exe || f->val > PATH_MAX)
-				goto exit_free;
-			str = audit_unpack_string(&bufp, &remain, f->val);
+			str = audit_unpack_string(&bufp, &remain, f_val);
 			if (IS_ERR(str)) {
 				err = PTR_ERR(str);
 				goto exit_free;
 			}
-			entry->rule.buflen += f->val;
-
-			audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+			entry->rule.buflen += f_val;
+			entry->rule.filterkey = str;
+			break;
+		case AUDIT_EXE:
+			if (entry->rule.exe || f_val > PATH_MAX)
+				goto exit_free;
+			str = audit_unpack_string(&bufp, &remain, f_val);
+			if (IS_ERR(str)) {
+				err = PTR_ERR(str);
+				goto exit_free;
+			}
+			audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
 			if (IS_ERR(audit_mark)) {
 				kfree(str);
 				err = PTR_ERR(audit_mark);
 				goto exit_free;
 			}
+			entry->rule.buflen += f_val;
 			entry->rule.exe = audit_mark;
 			break;
+		default:
+			f->val = f_val;
+			break;
 		}
 	}
 

From 99db590b083fa2bc60adfcb5c839a62db4ef1d79 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 12 Feb 2020 11:10:58 +0100
Subject: [PATCH 159/239] csky: Replace <linux/clk-provider.h> by
 <linux/of_clk.h>

The C-Sky platform code is not a clock provider, and just needs to call
of_clk_init().

Hence it can include <linux/of_clk.h> instead of <linux/clk-provider.h>.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
---
 arch/csky/kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/csky/kernel/time.c b/arch/csky/kernel/time.c
index b5fc9447d93f..52379d866fe4 100644
--- a/arch/csky/kernel/time.c
+++ b/arch/csky/kernel/time.c
@@ -1,8 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 // Copyright (C) 2018 Hangzhou C-SKY Microsystems co.,ltd.
 
-#include <linux/clk-provider.h>
 #include <linux/clocksource.h>
+#include <linux/of_clk.h>
 
 void __init time_init(void)
 {

From 42d84c8490f9f0931786f1623191fcab397c3d64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma@redhat.com>
Date: Fri, 21 Feb 2020 12:06:56 +0100
Subject: [PATCH 160/239] vhost: Check docket sk_family instead of call getname
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Doing so, we save one call to get data we already have in the struct.

Also, since there is no guarantee that getname use sockaddr_ll
parameter beyond its size, we add a little bit of security here.
It should do not do beyond MAX_ADDR_LEN, but syzbot found that
ax25_getname writes more (72 bytes, the size of full_sockaddr_ax25,
versus 20 + 32 bytes of sockaddr_ll + MAX_ADDR_LEN in syzbot repro).

Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
Reported-by: syzbot+f2a62d07a5198c819c7b@syzkaller.appspotmail.com
Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/net.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e158159671fa..18e205eeb9af 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1414,10 +1414,6 @@ static int vhost_net_release(struct inode *inode, struct file *f)
 
 static struct socket *get_raw_socket(int fd)
 {
-	struct {
-		struct sockaddr_ll sa;
-		char  buf[MAX_ADDR_LEN];
-	} uaddr;
 	int r;
 	struct socket *sock = sockfd_lookup(fd, &r);
 
@@ -1430,11 +1426,7 @@ static struct socket *get_raw_socket(int fd)
 		goto err;
 	}
 
-	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, 0);
-	if (r < 0)
-		goto err;
-
-	if (uaddr.sa.sll_family != AF_PACKET) {
+	if (sock->sk->sk_family != AF_PACKET) {
 		r = -EPFNOSUPPORT;
 		goto err;
 	}

From 3e72dfdf8227b052393f71d820ec7599909dddc2 Mon Sep 17 00:00:00 2001
From: Matteo Croce <mcroce@redhat.com>
Date: Fri, 21 Feb 2020 12:28:38 +0100
Subject: [PATCH 161/239] ipv4: ensure rcu_read_lock() in cipso_v4_error()

Similarly to commit c543cb4a5f07 ("ipv4: ensure rcu_read_lock() in
ipv4_link_failure()"), __ip_options_compile() must be called under rcu
protection.

Fixes: 3da1ed7ac398 ("net: avoid use IPCB in cipso_v4_error")
Suggested-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/cipso_ipv4.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 376882215919..0bd10a1f477f 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1724,6 +1724,7 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
 {
 	unsigned char optbuf[sizeof(struct ip_options) + 40];
 	struct ip_options *opt = (struct ip_options *)optbuf;
+	int res;
 
 	if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
 		return;
@@ -1735,7 +1736,11 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
 
 	memset(opt, 0, sizeof(struct ip_options));
 	opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
-	if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL))
+	rcu_read_lock();
+	res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL);
+	rcu_read_unlock();
+
+	if (res)
 		return;
 
 	if (gateway)

From 39f3b41aa7cae917f928ef9f31d09da28188e5ed Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 21 Feb 2020 19:42:13 +0100
Subject: [PATCH 162/239] net: genetlink: return the error code when attribute
 parsing fails.

Currently if attribute parsing fails and the genl family
does not support parallel operation, the error code returned
by __nlmsg_parse() is discarded by genl_family_rcv_msg_attrs_parse().

Be sure to report the error for all genl families.

Fixes: c10e6cf85e7d ("net: genetlink: push attrbuf allocation and parsing to a separate function")
Fixes: ab5b526da048 ("net: genetlink: always allocate separate attrs for dumpit ops")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/genetlink.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 0522b2b1fd95..9f357aa22b94 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -497,8 +497,9 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family,
 
 	err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr,
 			    family->policy, validate, extack);
-	if (err && parallel) {
-		kfree(attrbuf);
+	if (err) {
+		if (parallel)
+			kfree(attrbuf);
 		return ERR_PTR(err);
 	}
 	return attrbuf;

From dd58f3c95c98e6e2cf30d9e562cae0503c5f2713 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Sun, 23 Feb 2020 16:13:12 +0800
Subject: [PATCH 163/239] KVM: fix error handling in svm_hardware_setup

rename svm_hardware_unsetup as svm_hardware_teardown, move
it before svm_hardware_setup, and call it to free all memory
if fail to setup in svm_hardware_setup, otherwise memory will
be leaked

remove __exit attribute for it since it is called in __init
function

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ff02aeb23616..d9b5add5a211 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1349,6 +1349,24 @@ static __init void svm_adjust_mmio_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
 }
 
+static void svm_hardware_teardown(void)
+{
+	int cpu;
+
+	if (svm_sev_enabled()) {
+		bitmap_free(sev_asid_bitmap);
+		bitmap_free(sev_reclaim_asid_bitmap);
+
+		sev_flush_asids();
+	}
+
+	for_each_possible_cpu(cpu)
+		svm_cpu_uninit(cpu);
+
+	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+	iopm_base = 0;
+}
+
 static __init int svm_hardware_setup(void)
 {
 	int cpu;
@@ -1462,29 +1480,10 @@ static __init int svm_hardware_setup(void)
 	return 0;
 
 err:
-	__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
-	iopm_base = 0;
+	svm_hardware_teardown();
 	return r;
 }
 
-static __exit void svm_hardware_unsetup(void)
-{
-	int cpu;
-
-	if (svm_sev_enabled()) {
-		bitmap_free(sev_asid_bitmap);
-		bitmap_free(sev_reclaim_asid_bitmap);
-
-		sev_flush_asids();
-	}
-
-	for_each_possible_cpu(cpu)
-		svm_cpu_uninit(cpu);
-
-	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
-	iopm_base = 0;
-}
-
 static void init_seg(struct vmcb_seg *seg)
 {
 	seg->selector = 0;
@@ -7385,7 +7384,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
 	.hardware_setup = svm_hardware_setup,
-	.hardware_unsetup = svm_hardware_unsetup,
+	.hardware_unsetup = svm_hardware_teardown,
 	.check_processor_compatibility = svm_check_processor_compat,
 	.hardware_enable = svm_hardware_enable,
 	.hardware_disable = svm_hardware_disable,

From 5ef8acbdd687c9d72582e2c05c0b9756efb37863 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oupton@google.com>
Date: Fri, 7 Feb 2020 02:36:07 -0800
Subject: [PATCH 164/239] KVM: nVMX: Emulate MTF when performing instruction
 emulation

Since commit 5f3d45e7f282 ("kvm/x86: add support for
MONITOR_TRAP_FLAG"), KVM has allowed an L1 guest to use the monitor trap
flag processor-based execution control for its L2 guest. KVM simply
forwards any MTF VM-exits to the L1 guest, which works for normal
instruction execution.

However, when KVM needs to emulate an instruction on the behalf of an L2
guest, the monitor trap flag is not emulated. Add the necessary logic to
kvm_skip_emulated_instruction() to synthesize an MTF VM-exit to L1 upon
instruction emulation for L2.

Fixes: 5f3d45e7f282 ("kvm/x86: add support for MONITOR_TRAP_FLAG")
Signed-off-by: Oliver Upton <oupton@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/include/uapi/asm/kvm.h |  1 +
 arch/x86/kvm/svm.c              |  1 +
 arch/x86/kvm/vmx/nested.c       | 35 ++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/nested.h       |  5 +++++
 arch/x86/kvm/vmx/vmx.c          | 37 ++++++++++++++++++++++++++++++++-
 arch/x86/kvm/vmx/vmx.h          |  3 +++
 arch/x86/kvm/x86.c              |  2 ++
 8 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a84e8c5acda8..98959e8cd448 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1122,6 +1122,7 @@ struct kvm_x86_ops {
 	int (*handle_exit)(struct kvm_vcpu *vcpu,
 		enum exit_fastpath_completion exit_fastpath);
 	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+	void (*update_emulated_instruction)(struct kvm_vcpu *vcpu);
 	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 503d3f42da16..3f3f780c8c65 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -390,6 +390,7 @@ struct kvm_sync_regs {
 #define KVM_STATE_NESTED_GUEST_MODE	0x00000001
 #define KVM_STATE_NESTED_RUN_PENDING	0x00000002
 #define KVM_STATE_NESTED_EVMCS		0x00000004
+#define KVM_STATE_NESTED_MTF_PENDING	0x00000008
 
 #define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
 #define KVM_STATE_NESTED_SMM_VMXON	0x00000002
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d9b5add5a211..ad3f5b178a03 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -7439,6 +7439,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.run = svm_vcpu_run,
 	.handle_exit = handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.update_emulated_instruction = NULL,
 	.set_interrupt_shadow = svm_set_interrupt_shadow,
 	.get_interrupt_shadow = svm_get_interrupt_shadow,
 	.patch_hypercall = svm_patch_hypercall,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 2b3ba7d27be4..50d8dbb3616d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3609,8 +3609,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 	unsigned long exit_qual;
 	bool block_nested_events =
 	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
+	bool mtf_pending = vmx->nested.mtf_pending;
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
+	/*
+	 * Clear the MTF state. If a higher priority VM-exit is delivered first,
+	 * this state is discarded.
+	 */
+	vmx->nested.mtf_pending = false;
+
 	if (lapic_in_kernel(vcpu) &&
 		test_bit(KVM_APIC_INIT, &apic->pending_events)) {
 		if (block_nested_events)
@@ -3621,8 +3628,28 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 		return 0;
 	}
 
+	/*
+	 * Process any exceptions that are not debug traps before MTF.
+	 */
 	if (vcpu->arch.exception.pending &&
-		nested_vmx_check_exception(vcpu, &exit_qual)) {
+	    !vmx_pending_dbg_trap(vcpu) &&
+	    nested_vmx_check_exception(vcpu, &exit_qual)) {
+		if (block_nested_events)
+			return -EBUSY;
+		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+		return 0;
+	}
+
+	if (mtf_pending) {
+		if (block_nested_events)
+			return -EBUSY;
+		nested_vmx_update_pending_dbg(vcpu);
+		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+		return 0;
+	}
+
+	if (vcpu->arch.exception.pending &&
+	    nested_vmx_check_exception(vcpu, &exit_qual)) {
 		if (block_nested_events)
 			return -EBUSY;
 		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
@@ -5712,6 +5739,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
 
 			if (vmx->nested.nested_run_pending)
 				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+
+			if (vmx->nested.mtf_pending)
+				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
 		}
 	}
 
@@ -5892,6 +5922,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
 	vmx->nested.nested_run_pending =
 		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
 
+	vmx->nested.mtf_pending =
+		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
+
 	ret = -EINVAL;
 	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
 	    vmcs12->vmcs_link_pointer != -1ull) {
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 1c5fbff45d69..1db388f2a444 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -174,6 +174,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_mtf(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+}
+
 static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
 {
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 404dafedd778..dcca514ffd42 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1603,6 +1603,40 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+
+/*
+ * Recognizes a pending MTF VM-exit and records the nested state for later
+ * delivery.
+ */
+static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!is_guest_mode(vcpu))
+		return;
+
+	/*
+	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
+	 * T-bit traps. As instruction emulation is completed (i.e. at the
+	 * instruction boundary), any #DB exception pending delivery must be a
+	 * debug-trap. Record the pending MTF state to be delivered in
+	 * vmx_check_nested_events().
+	 */
+	if (nested_cpu_has_mtf(vmcs12) &&
+	    (!vcpu->arch.exception.pending ||
+	     vcpu->arch.exception.nr == DB_VECTOR))
+		vmx->nested.mtf_pending = true;
+	else
+		vmx->nested.mtf_pending = false;
+}
+
+static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+	vmx_update_emulated_instruction(vcpu);
+	return skip_emulated_instruction(vcpu);
+}
+
 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -7796,7 +7830,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.run = vmx_vcpu_run,
 	.handle_exit = vmx_handle_exit,
-	.skip_emulated_instruction = skip_emulated_instruction,
+	.skip_emulated_instruction = vmx_skip_emulated_instruction,
+	.update_emulated_instruction = vmx_update_emulated_instruction,
 	.set_interrupt_shadow = vmx_set_interrupt_shadow,
 	.get_interrupt_shadow = vmx_get_interrupt_shadow,
 	.patch_hypercall = vmx_patch_hypercall,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 7f42cf3dcd70..e64da06c7009 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -150,6 +150,9 @@ struct nested_vmx {
 	/* L2 must run next, and mustn't decide to exit to L1. */
 	bool nested_run_pending;
 
+	/* Pending MTF VM-exit into L1.  */
+	bool mtf_pending;
+
 	struct loaded_vmcs vmcs02;
 
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fb5d64ebc35d..359fcd395132 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6891,6 +6891,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 			kvm_rip_write(vcpu, ctxt->eip);
 			if (r && ctxt->tf)
 				r = kvm_vcpu_do_singlestep(vcpu);
+			if (kvm_x86_ops->update_emulated_instruction)
+				kvm_x86_ops->update_emulated_instruction(vcpu);
 			__kvm_set_rflags(vcpu, ctxt->eflags);
 		}
 

From 07721feee46b4b248402133228235318199b05ec Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 4 Feb 2020 15:26:29 -0800
Subject: [PATCH 165/239] KVM: nVMX: Don't emulate instructions in guest mode

vmx_check_intercept is not yet fully implemented. To avoid emulating
instructions disallowed by the L1 hypervisor, refuse to emulate
instructions by default.

Cc: stable@vger.kernel.org
[Made commit, added commit msg - Oliver]
Signed-off-by: Oliver Upton <oupton@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index dcca514ffd42..5801a86f9c24 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7164,7 +7164,7 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 	}
 
 	/* TODO: check more intercepts... */
-	return X86EMUL_CONTINUE;
+	return X86EMUL_UNHANDLEABLE;
 }
 
 #ifdef CONFIG_X86_64

From e71237d3ff1abf9f3388337cfebf53b96df2020d Mon Sep 17 00:00:00 2001
From: Oliver Upton <oupton@google.com>
Date: Tue, 4 Feb 2020 15:26:30 -0800
Subject: [PATCH 166/239] KVM: nVMX: Refactor IO bitmap checks into helper
 function

Checks against the IO bitmap are useful for both instruction emulation
and VM-exit reflection. Refactor the IO bitmap checks into a helper
function.

Signed-off-by: Oliver Upton <oupton@google.com>
Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c | 39 +++++++++++++++++++++++++--------------
 arch/x86/kvm/vmx/nested.h |  2 ++
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 50d8dbb3616d..f979832c394d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5312,24 +5312,17 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-
-static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
-				       struct vmcs12 *vmcs12)
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+				 int size)
 {
-	unsigned long exit_qualification;
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	gpa_t bitmap, last_bitmap;
-	unsigned int port;
-	int size;
 	u8 b;
 
-	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
-		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
-	port = exit_qualification >> 16;
-	size = (exit_qualification & 7) + 1;
-
 	last_bitmap = (gpa_t)-1;
 	b = -1;
 
@@ -5356,6 +5349,24 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 	return false;
 }
 
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+				       struct vmcs12 *vmcs12)
+{
+	unsigned long exit_qualification;
+	unsigned int port;
+	int size;
+
+	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	port = exit_qualification >> 16;
+	size = (exit_qualification & 7) + 1;
+
+	return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
 /*
  * Return 1 if we should exit from L2 to L1 to handle an MSR access,
  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 1db388f2a444..9aeda46f473e 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -33,6 +33,8 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
 			u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+				 int size);
 
 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 {

From 35a571346a94fb93b5b3b6a599675ef3384bc75c Mon Sep 17 00:00:00 2001
From: Oliver Upton <oupton@google.com>
Date: Tue, 4 Feb 2020 15:26:31 -0800
Subject: [PATCH 167/239] KVM: nVMX: Check IO instruction VM-exit conditions

Consult the 'unconditional IO exiting' and 'use IO bitmaps' VM-execution
controls when checking instruction interception. If the 'use IO bitmaps'
VM-execution control is 1, check the instruction access against the IO
bitmaps to determine if the instruction causes a VM-exit.

Signed-off-by: Oliver Upton <oupton@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/nested.c |  2 +-
 arch/x86/kvm/vmx/vmx.c    | 57 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f979832c394d..e920d7834d73 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5353,7 +5353,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 				       struct vmcs12 *vmcs12)
 {
 	unsigned long exit_qualification;
-	unsigned int port;
+	unsigned short port;
 	int size;
 
 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5801a86f9c24..63aaf44edd1f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7145,6 +7145,39 @@ static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
 	to_vmx(vcpu)->req_immediate_exit = true;
 }
 
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+				  struct x86_instruction_info *info)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	unsigned short port;
+	bool intercept;
+	int size;
+
+	if (info->intercept == x86_intercept_in ||
+	    info->intercept == x86_intercept_ins) {
+		port = info->src_val;
+		size = info->dst_bytes;
+	} else {
+		port = info->dst_val;
+		size = info->src_bytes;
+	}
+
+	/*
+	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
+	 * control.
+	 *
+	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+	 */
+	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+		intercept = nested_cpu_has(vmcs12,
+					   CPU_BASED_UNCOND_IO_EXITING);
+	else
+		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage)
@@ -7152,18 +7185,30 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 
+	switch (info->intercept) {
 	/*
 	 * RDPID causes #UD if disabled through secondary execution controls.
 	 * Because it is marked as EmulateOnUD, we need to intercept it here.
 	 */
-	if (info->intercept == x86_intercept_rdtscp &&
-	    !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
-		ctxt->exception.vector = UD_VECTOR;
-		ctxt->exception.error_code_valid = false;
-		return X86EMUL_PROPAGATE_FAULT;
-	}
+	case x86_intercept_rdtscp:
+		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+			ctxt->exception.vector = UD_VECTOR;
+			ctxt->exception.error_code_valid = false;
+			return X86EMUL_PROPAGATE_FAULT;
+		}
+		break;
+
+	case x86_intercept_in:
+	case x86_intercept_ins:
+	case x86_intercept_out:
+	case x86_intercept_outs:
+		return vmx_check_intercept_io(vcpu, info);
 
 	/* TODO: check more intercepts... */
+	default:
+		break;
+	}
+
 	return X86EMUL_UNHANDLEABLE;
 }
 

From eae7172f8141eb98e64e6e81acc9e9d5b2add127 Mon Sep 17 00:00:00 2001
From: Daniele Palmas <dnlplm@gmail.com>
Date: Fri, 21 Feb 2020 14:17:05 +0100
Subject: [PATCH 168/239] net: usb: qmi_wwan: restore mtu min/max values after
 raw_ip switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

usbnet creates network interfaces with min_mtu = 0 and
max_mtu = ETH_MAX_MTU.

These values are not modified by qmi_wwan when the network interface
is created initially, allowing, for example, to set mtu greater than 1500.

When a raw_ip switch is done (raw_ip set to 'Y', then set to 'N') the mtu
values for the network interface are set through ether_setup, with
min_mtu = ETH_MIN_MTU and max_mtu = ETH_DATA_LEN, not allowing anymore to
set mtu greater than 1500 (error: mtu greater than device maximum).

The patch restores the original min/max mtu values set by usbnet after a
raw_ip switch.

Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
Acked-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 3b7a3b8a5e06..5754bb6ca0ee 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -337,6 +337,9 @@ static void qmi_wwan_netdev_setup(struct net_device *net)
 		netdev_dbg(net, "mode: raw IP\n");
 	} else if (!net->header_ops) { /* don't bother if already set */
 		ether_setup(net);
+		/* Restoring min/max mtu values set originally by usbnet */
+		net->min_mtu = 0;
+		net->max_mtu = ETH_MAX_MTU;
 		clear_bit(EVENT_NO_IP_ALIGN, &dev->flags);
 		netdev_dbg(net, "mode: Ethernet\n");
 	}

From f8788d86ab28f61f7b46eb6be375f8a726783636 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 23 Feb 2020 16:17:42 -0800
Subject: [PATCH 169/239] Linux 5.6-rc3

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index aab38cb02b24..0914049d2929 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 5
 PATCHLEVEL = 6
 SUBLEVEL = 0
-EXTRAVERSION = -rc2
+EXTRAVERSION = -rc3
 NAME = Kleptomaniac Octopus
 
 # *DOCUMENTATION*

From f6f13c125e05603f68f5bf31f045b95e6d493598 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Fri, 21 Feb 2020 08:32:18 -0800
Subject: [PATCH 170/239] hv_netvsc: Fix unwanted wakeup in netvsc_attach()

When netvsc_attach() is called by operations like changing MTU, etc.,
an extra wakeup may happen while netvsc_attach() calling
rndis_filter_device_add() which sends rndis messages when queue is
stopped in netvsc_detach(). The completion message will wake up queue 0.

We can reproduce the issue by changing MTU etc., then the wake_queue
counter from "ethtool -S" will increase beyond stop_queue counter:
     stop_queue: 0
     wake_queue: 1
The issue causes queue wake up, and counter increment, no other ill
effects in current code. So we didn't see any network problem for now.

To fix this, initialize tx_disable to true, and set it to false when
the NIC is ready to be attached or registered.

Fixes: 7b2ee50c0cd5 ("hv_netvsc: common detach logic")
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c     | 2 +-
 drivers/net/hyperv/netvsc_drv.c | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index ae3f3084c2ed..1b320bcf150a 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -99,7 +99,7 @@ static struct netvsc_device *alloc_net_device(void)
 
 	init_waitqueue_head(&net_device->wait_drain);
 	net_device->destroy = false;
-	net_device->tx_disable = false;
+	net_device->tx_disable = true;
 
 	net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
 	net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 65e12cb07f45..2c0a24c606fc 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1068,6 +1068,7 @@ static int netvsc_attach(struct net_device *ndev,
 	}
 
 	/* In any case device is now ready */
+	nvdev->tx_disable = false;
 	netif_device_attach(ndev);
 
 	/* Note: enable and attach happen when sub-channels setup */
@@ -2476,6 +2477,8 @@ static int netvsc_probe(struct hv_device *dev,
 	else
 		net->max_mtu = ETH_DATA_LEN;
 
+	nvdev->tx_disable = false;
+
 	ret = register_netdevice(net);
 	if (ret != 0) {
 		pr_err("Unable to register netdev.\n");

From dad8cea7add96a353fa1898b5ccefbb72da66f29 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 22 Feb 2020 11:21:15 -0500
Subject: [PATCH 171/239] tcp: fix TFO SYNACK undo to avoid
 double-timestamp-undo

In a rare corner case the new logic for undo of SYNACK RTO could
result in triggering the warning in tcp_fastretrans_alert() that says:
        WARN_ON(tp->retrans_out != 0);

The warning looked like:

WARNING: CPU: 1 PID: 1 at net/ipv4/tcp_input.c:2818 tcp_ack+0x13e0/0x3270

The sequence that tickles this bug is:
 - Fast Open server receives TFO SYN with data, sends SYNACK
 - (client receives SYNACK and sends ACK, but ACK is lost)
 - server app sends some data packets
 - (N of the first data packets are lost)
 - server receives client ACK that has a TS ECR matching first SYNACK,
   and also SACKs suggesting the first N data packets were lost
    - server performs TS undo of SYNACK RTO, then immediately
      enters recovery
    - buggy behavior then performed a *second* undo that caused
      the connection to be in CA_Open with retrans_out != 0

Basically, the incoming ACK packet with SACK blocks causes us to first
undo the cwnd reduction from the SYNACK RTO, but then immediately
enters fast recovery, which then makes us eligible for undo again. And
then tcp_rcv_synrecv_state_fastopen() accidentally performs an undo
using a "mash-up" of state from two different loss recovery phases: it
uses the timestamp info from the ACK of the original SYNACK, and the
undo_marker from the fast recovery.

This fix refines the logic to only invoke the tcp_try_undo_loss()
inside tcp_rcv_synrecv_state_fastopen() if the connection is still in
CA_Loss.  If peer SACKs triggered fast recovery, then
tcp_rcv_synrecv_state_fastopen() can't safely undo.

Fixes: 794200d66273 ("tcp: undo cwnd on Fast Open spurious SYNACK retransmit")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 316ebdf8151d..6b6b57000dad 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6124,7 +6124,11 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
 {
 	struct request_sock *req;
 
-	tcp_try_undo_loss(sk, false);
+	/* If we are still handling the SYNACK RTO, see if timestamp ECR allows
+	 * undo. If peer SACKs triggered fast recovery, we can't undo here.
+	 */
+	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
+		tcp_try_undo_loss(sk, false);
 
 	/* Reset rtx states to prevent spurious retransmits_timed_out() */
 	tcp_sk(sk)->retrans_stamp = 0;

From 52df1e564eb0470b2ecd1481e457932f09f49f5d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= <j.neuschaefer@gmx.net>
Date: Sun, 23 Feb 2020 18:46:31 +0100
Subject: [PATCH 172/239] docs: networking: phy: Rephrase paragraph for clarity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Let's make it a little easier to read.

Signed-off-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/phy.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index 1e4735cc0553..256106054c8c 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -487,8 +487,9 @@ phy_register_fixup_for_id()::
 The stubs set one of the two matching criteria, and set the other one to
 match anything.
 
-When phy_register_fixup() or \*_for_uid()/\*_for_id() is called at module,
-unregister fixup and free allocate memory are required.
+When phy_register_fixup() or \*_for_uid()/\*_for_id() is called at module load
+time, the module needs to unregister the fixup and free allocated memory when
+it's unloaded.
 
 Call one of following function before unloading module::
 

From 44343418d0f2f623cb9da6f5000df793131cbe3b Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Sun, 23 Feb 2020 14:38:40 +0100
Subject: [PATCH 173/239] net: ks8851-ml: Fix IRQ handling and locking

The KS8851 requires that packet RX and TX are mutually exclusive.
Currently, the driver hopes to achieve this by disabling interrupt
from the card by writing the card registers and by disabling the
interrupt on the interrupt controller. This however is racy on SMP.

Replace this approach by expanding the spinlock used around the
ks_start_xmit() TX path to ks_irq() RX path to assure true mutual
exclusion and remove the interrupt enabling/disabling, which is
now not needed anymore. Furthermore, disable interrupts also in
ks_net_stop(), which was missing before.

Note that a massive improvement here would be to re-use the KS8851
driver approach, which is to move the TX path into a worker thread,
interrupt handling to threaded interrupt, and synchronize everything
with mutexes, but that would be a much bigger rework, for a separate
patch.

Signed-off-by: Marek Vasut <marex@denx.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Petr Stetiar <ynezz@true.cz>
Cc: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/micrel/ks8851_mll.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/micrel/ks8851_mll.c b/drivers/net/ethernet/micrel/ks8851_mll.c
index 1c9e70c8cc30..58579baf3f7a 100644
--- a/drivers/net/ethernet/micrel/ks8851_mll.c
+++ b/drivers/net/ethernet/micrel/ks8851_mll.c
@@ -513,14 +513,17 @@ static irqreturn_t ks_irq(int irq, void *pw)
 {
 	struct net_device *netdev = pw;
 	struct ks_net *ks = netdev_priv(netdev);
+	unsigned long flags;
 	u16 status;
 
+	spin_lock_irqsave(&ks->statelock, flags);
 	/*this should be the first in IRQ handler */
 	ks_save_cmd_reg(ks);
 
 	status = ks_rdreg16(ks, KS_ISR);
 	if (unlikely(!status)) {
 		ks_restore_cmd_reg(ks);
+		spin_unlock_irqrestore(&ks->statelock, flags);
 		return IRQ_NONE;
 	}
 
@@ -546,6 +549,7 @@ static irqreturn_t ks_irq(int irq, void *pw)
 		ks->netdev->stats.rx_over_errors++;
 	/* this should be the last in IRQ handler*/
 	ks_restore_cmd_reg(ks);
+	spin_unlock_irqrestore(&ks->statelock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -615,6 +619,7 @@ static int ks_net_stop(struct net_device *netdev)
 
 	/* shutdown RX/TX QMU */
 	ks_disable_qmu(ks);
+	ks_disable_int(ks);
 
 	/* set powermode to soft power down to save power */
 	ks_set_powermode(ks, PMECR_PM_SOFTDOWN);
@@ -671,10 +676,9 @@ static netdev_tx_t ks_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 {
 	netdev_tx_t retv = NETDEV_TX_OK;
 	struct ks_net *ks = netdev_priv(netdev);
+	unsigned long flags;
 
-	disable_irq(netdev->irq);
-	ks_disable_int(ks);
-	spin_lock(&ks->statelock);
+	spin_lock_irqsave(&ks->statelock, flags);
 
 	/* Extra space are required:
 	*  4 byte for alignment, 4 for status/length, 4 for CRC
@@ -688,9 +692,7 @@ static netdev_tx_t ks_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 		dev_kfree_skb(skb);
 	} else
 		retv = NETDEV_TX_BUSY;
-	spin_unlock(&ks->statelock);
-	ks_enable_int(ks);
-	enable_irq(netdev->irq);
+	spin_unlock_irqrestore(&ks->statelock, flags);
 	return retv;
 }
 

From 503ba7c6961034ff0047707685644cad9287c226 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 20 Feb 2020 15:34:53 -0800
Subject: [PATCH 174/239] net: phy: Avoid multiple suspends

It is currently possible for a PHY device to be suspended as part of a
network device driver's suspend call while it is still being attached to
that net_device, either via phy_suspend() or implicitly via phy_stop().

Later on, when the MDIO bus controller get suspended, we would attempt
to suspend again the PHY because it is still attached to a network
device.

This is both a waste of time and creates an opportunity for improper
clock/power management bugs to creep in.

Fixes: 803dd9c77ac3 ("net: phy: avoid suspending twice a PHY")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 6a5056e0ae77..6131aca79823 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -247,7 +247,7 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
 	 * MDIO bus driver and clock gated at this point.
 	 */
 	if (!netdev)
-		return !phydev->suspended;
+		goto out;
 
 	if (netdev->wol_enabled)
 		return false;
@@ -267,7 +267,8 @@ static bool mdio_bus_phy_may_suspend(struct phy_device *phydev)
 	if (device_may_wakeup(&netdev->dev))
 		return false;
 
-	return true;
+out:
+	return !phydev->suspended;
 }
 
 static int mdio_bus_phy_suspend(struct device *dev)

From 6132c1d9033d158bd0464e90bc46544fbe0bd6bc Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Date: Sun, 23 Feb 2020 16:52:33 +0530
Subject: [PATCH 175/239] net: core: devlink.c: Hold devlink->lock from the
 beginning of devlink_dpipe_table_register()

devlink_dpipe_table_find() should be called under either
rcu_read_lock() or devlink->lock. devlink_dpipe_table_register()
calls devlink_dpipe_table_find() without holding the lock
and acquires it later. Therefore hold the devlink->lock
from the beginning of devlink_dpipe_table_register().

Suggested-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 549ee56b7a21..8d0b558be942 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -6878,26 +6878,33 @@ int devlink_dpipe_table_register(struct devlink *devlink,
 				 void *priv, bool counter_control_extern)
 {
 	struct devlink_dpipe_table *table;
-
-	if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name))
-		return -EEXIST;
+	int err = 0;
 
 	if (WARN_ON(!table_ops->size_get))
 		return -EINVAL;
 
+	mutex_lock(&devlink->lock);
+
+	if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) {
+		err = -EEXIST;
+		goto unlock;
+	}
+
 	table = kzalloc(sizeof(*table), GFP_KERNEL);
-	if (!table)
-		return -ENOMEM;
+	if (!table) {
+		err = -ENOMEM;
+		goto unlock;
+	}
 
 	table->name = table_name;
 	table->table_ops = table_ops;
 	table->priv = priv;
 	table->counter_control_extern = counter_control_extern;
 
-	mutex_lock(&devlink->lock);
 	list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
+unlock:
 	mutex_unlock(&devlink->lock);
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
 

From e3ae39edbce6dc933fb1393490d1b5d76d3edb90 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Feb 2020 09:38:15 +0100
Subject: [PATCH 176/239] nl80211: explicitly include if_vlan.h

We use that here, and do seem to get it through some recursive
include, but better include it explicitly.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Link: https://lore.kernel.org/r/20200224093814.1b9c258fec67.I45ac150d4e11c72eb263abec9f1f0c7add9bef2b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 46be40e19e7f..5b19e9fac4aa 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -20,6 +20,7 @@
 #include <linux/netlink.h>
 #include <linux/nospec.h>
 #include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
 #include <net/net_namespace.h>
 #include <net/genetlink.h>
 #include <net/cfg80211.h>

From 253216ffb2a002a682c6f68bd3adff5b98b71de8 Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Date: Sun, 23 Feb 2020 20:03:02 +0530
Subject: [PATCH 177/239] mac80211: rx: avoid RCU list traversal under mutex

local->sta_mtx is held in __ieee80211_check_fast_rx_iface().
No need to use list_for_each_entry_rcu() as it also requires
a cond argument to avoid false lockdep warnings when not used in
RCU read-side section (with CONFIG_PROVE_RCU_LIST).
Therefore use list_for_each_entry();

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Link: https://lore.kernel.org/r/20200223143302.15390-1-madhuparnabhowmik10@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 0e05ff037672..0ba98ad9bc85 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4114,7 +4114,7 @@ void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata)
 
 	lockdep_assert_held(&local->sta_mtx);
 
-	list_for_each_entry_rcu(sta, &local->sta_list, list) {
+	list_for_each_entry(sta, &local->sta_list, list) {
 		if (sdata != sta->sdata &&
 		    (!sta->sdata->bss || sta->sdata->bss != sdata->bss))
 			continue;

From a93236fcbe1d0248461b29c0f87cb0b510c94e6f Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Mon, 24 Feb 2020 11:15:59 +0100
Subject: [PATCH 178/239] KVM: s390: rstify new ioctls in api.rst

We also need to rstify the new ioctls that we added in parallel to the
rstification of the kvm docs.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virt/kvm/api.rst | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 97a72a53fa4b..ebd383fba939 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4611,35 +4611,38 @@ unpins the VPA pages and releases all the device pages that are used to
 track the secure pages by hypervisor.
 
 4.122 KVM_S390_NORMAL_RESET
+---------------------------
 
-Capability: KVM_CAP_S390_VCPU_RESETS
-Architectures: s390
-Type: vcpu ioctl
-Parameters: none
-Returns: 0
+:Capability: KVM_CAP_S390_VCPU_RESETS
+:Architectures: s390
+:Type: vcpu ioctl
+:Parameters: none
+:Returns: 0
 
 This ioctl resets VCPU registers and control structures according to
 the cpu reset definition in the POP (Principles Of Operation).
 
 4.123 KVM_S390_INITIAL_RESET
+----------------------------
 
-Capability: none
-Architectures: s390
-Type: vcpu ioctl
-Parameters: none
-Returns: 0
+:Capability: none
+:Architectures: s390
+:Type: vcpu ioctl
+:Parameters: none
+:Returns: 0
 
 This ioctl resets VCPU registers and control structures according to
 the initial cpu reset definition in the POP. However, the cpu is not
 put into ESA mode. This reset is a superset of the normal reset.
 
 4.124 KVM_S390_CLEAR_RESET
+--------------------------
 
-Capability: KVM_CAP_S390_VCPU_RESETS
-Architectures: s390
-Type: vcpu ioctl
-Parameters: none
-Returns: 0
+:Capability: KVM_CAP_S390_VCPU_RESETS
+:Architectures: s390
+:Type: vcpu ioctl
+:Parameters: none
+:Returns: 0
 
 This ioctl resets VCPU registers and control structures according to
 the clear cpu reset definition in the POP. However, the cpu is not put

From 84823ff80f7403752b59e00bb198724100dc611c Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Fri, 21 Feb 2020 07:47:21 +0100
Subject: [PATCH 179/239] net: ll_temac: Fix race condition causing TX hang

It is possible that the interrupt handler fires and frees up space in
the TX ring in between checking for sufficient TX ring space and
stopping the TX queue in temac_start_xmit. If this happens, the
queue wake from the interrupt handler will occur before the queue is
stopped, causing a lost wakeup and the adapter's transmit hanging.

To avoid this, after stopping the queue, check again whether there is
sufficient space in the TX ring. If so, wake up the queue again.

This is a port of the similar fix in axienet driver,
commit 7de44285c1f6 ("net: axienet: Fix race condition causing TX hang").

Fixes: 23ecc4bde21f ("net: ll_temac: fix checksum offload logic")
Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/ll_temac_main.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 6f11f52c9a9e..996004ef8bd4 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -788,6 +788,9 @@ static void temac_start_xmit_done(struct net_device *ndev)
 		stat = be32_to_cpu(cur_p->app0);
 	}
 
+	/* Matches barrier in temac_start_xmit */
+	smp_mb();
+
 	netif_wake_queue(ndev);
 }
 
@@ -830,9 +833,19 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	cur_p = &lp->tx_bd_v[lp->tx_bd_tail];
 
 	if (temac_check_tx_bd_space(lp, num_frag + 1)) {
-		if (!netif_queue_stopped(ndev))
-			netif_stop_queue(ndev);
-		return NETDEV_TX_BUSY;
+		if (netif_queue_stopped(ndev))
+			return NETDEV_TX_BUSY;
+
+		netif_stop_queue(ndev);
+
+		/* Matches barrier in temac_start_xmit_done */
+		smp_mb();
+
+		/* Space might have just been freed - check again */
+		if (temac_check_tx_bd_space(lp, num_frag))
+			return NETDEV_TX_BUSY;
+
+		netif_wake_queue(ndev);
 	}
 
 	cur_p->app0 = 0;

From d07c849cd2b97d6809430dfb7e738ad31088037a Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Fri, 21 Feb 2020 07:47:33 +0100
Subject: [PATCH 180/239] net: ll_temac: Add more error handling of
 dma_map_single() calls

This adds error handling to the remaining dma_map_single() calls, so that
behavior is well defined if/when we run out of DMA memory.

Fixes: 92744989533c ("net: add Xilinx ll_temac device driver")
Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/ll_temac_main.c | 26 +++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 996004ef8bd4..c368c3914bda 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -367,6 +367,8 @@ static int temac_dma_bd_init(struct net_device *ndev)
 		skb_dma_addr = dma_map_single(ndev->dev.parent, skb->data,
 					      XTE_MAX_JUMBO_FRAME_SIZE,
 					      DMA_FROM_DEVICE);
+		if (dma_mapping_error(ndev->dev.parent, skb_dma_addr))
+			goto out;
 		lp->rx_bd_v[i].phys = cpu_to_be32(skb_dma_addr);
 		lp->rx_bd_v[i].len = cpu_to_be32(XTE_MAX_JUMBO_FRAME_SIZE);
 		lp->rx_bd_v[i].app0 = cpu_to_be32(STS_CTRL_APP0_IRQONEND);
@@ -863,12 +865,13 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	skb_dma_addr = dma_map_single(ndev->dev.parent, skb->data,
 				      skb_headlen(skb), DMA_TO_DEVICE);
 	cur_p->len = cpu_to_be32(skb_headlen(skb));
+	if (WARN_ON_ONCE(dma_mapping_error(ndev->dev.parent, skb_dma_addr)))
+		return NETDEV_TX_BUSY;
 	cur_p->phys = cpu_to_be32(skb_dma_addr);
 	ptr_to_txbd((void *)skb, cur_p);
 
 	for (ii = 0; ii < num_frag; ii++) {
-		lp->tx_bd_tail++;
-		if (lp->tx_bd_tail >= TX_BD_NUM)
+		if (++lp->tx_bd_tail >= TX_BD_NUM)
 			lp->tx_bd_tail = 0;
 
 		cur_p = &lp->tx_bd_v[lp->tx_bd_tail];
@@ -876,6 +879,25 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 					      skb_frag_address(frag),
 					      skb_frag_size(frag),
 					      DMA_TO_DEVICE);
+		if (dma_mapping_error(ndev->dev.parent, skb_dma_addr)) {
+			if (--lp->tx_bd_tail < 0)
+				lp->tx_bd_tail = TX_BD_NUM - 1;
+			cur_p = &lp->tx_bd_v[lp->tx_bd_tail];
+			while (--ii >= 0) {
+				--frag;
+				dma_unmap_single(ndev->dev.parent,
+						 be32_to_cpu(cur_p->phys),
+						 skb_frag_size(frag),
+						 DMA_TO_DEVICE);
+				if (--lp->tx_bd_tail < 0)
+					lp->tx_bd_tail = TX_BD_NUM - 1;
+				cur_p = &lp->tx_bd_v[lp->tx_bd_tail];
+			}
+			dma_unmap_single(ndev->dev.parent,
+					 be32_to_cpu(cur_p->phys),
+					 skb_headlen(skb), DMA_TO_DEVICE);
+			return NETDEV_TX_BUSY;
+		}
 		cur_p->phys = cpu_to_be32(skb_dma_addr);
 		cur_p->len = cpu_to_be32(skb_frag_size(frag));
 		cur_p->app0 = 0;

From 770d9c67974c4c71af4beb786dc43162ad2a15ba Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Fri, 21 Feb 2020 07:47:45 +0100
Subject: [PATCH 181/239] net: ll_temac: Fix RX buffer descriptor handling on
 GFP_ATOMIC pressure

Failures caused by GFP_ATOMIC memory pressure have been observed, and
due to the missing error handling, results in kernel crash such as

[1876998.350133] kernel BUG at mm/slub.c:3952!
[1876998.350141] invalid opcode: 0000 [#1] PREEMPT SMP PTI
[1876998.350147] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 5.3.0-scnxt #1
[1876998.350150] Hardware name: N/A N/A/COMe-bIP2, BIOS CCR2R920 03/01/2017
[1876998.350160] RIP: 0010:kfree+0x1ca/0x220
[1876998.350164] Code: 85 db 74 49 48 8b 95 68 01 00 00 48 31 c2 48 89 10 e9 d7 fe ff ff 49 8b 04 24 a9 00 00 01 00 75 0b 49 8b 44 24 08 a8 01 75 02 <0f> 0b 49 8b 04 24 31 f6 a9 00 00 01 00 74 06 41 0f b6 74 24
 5b
[1876998.350172] RSP: 0018:ffffc900000f0df0 EFLAGS: 00010246
[1876998.350177] RAX: ffffea00027f0708 RBX: ffff888008d78000 RCX: 0000000000391372
[1876998.350181] RDX: 0000000000000000 RSI: ffffe8ffffd01400 RDI: ffff888008d78000
[1876998.350185] RBP: ffff8881185a5d00 R08: ffffc90000087dd8 R09: 000000000000280a
[1876998.350189] R10: 0000000000000002 R11: 0000000000000000 R12: ffffea0000235e00
[1876998.350193] R13: ffff8881185438a0 R14: 0000000000000000 R15: ffff888118543870
[1876998.350198] FS:  0000000000000000(0000) GS:ffff88811f300000(0000) knlGS:0000000000000000
[1876998.350203] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
s#1 Part1
[1876998.350206] CR2: 00007f8dac7b09f0 CR3: 000000011e20a006 CR4: 00000000001606e0
[1876998.350210] Call Trace:
[1876998.350215]  <IRQ>
[1876998.350224]  ? __netif_receive_skb_core+0x70a/0x920
[1876998.350229]  kfree_skb+0x32/0xb0
[1876998.350234]  __netif_receive_skb_core+0x70a/0x920
[1876998.350240]  __netif_receive_skb_one_core+0x36/0x80
[1876998.350245]  process_backlog+0x8b/0x150
[1876998.350250]  net_rx_action+0xf7/0x340
[1876998.350255]  __do_softirq+0x10f/0x353
[1876998.350262]  irq_exit+0xb2/0xc0
[1876998.350265]  do_IRQ+0x77/0xd0
[1876998.350271]  common_interrupt+0xf/0xf
[1876998.350274]  </IRQ>

In order to handle such failures more graceful, this change splits the
receive loop into one for consuming the received buffers, and one for
allocating new buffers.

When GFP_ATOMIC allocations fail, the receive will continue with the
buffers that is still there, and with the expectation that the allocations
will succeed in a later call to receive.

Fixes: 92744989533c ("net: add Xilinx ll_temac device driver")
Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/ll_temac.h      |   1 +
 drivers/net/ethernet/xilinx/ll_temac_main.c | 110 ++++++++++++++------
 2 files changed, 81 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/ll_temac.h b/drivers/net/ethernet/xilinx/ll_temac.h
index 276292bca334..99fe059e5c7f 100644
--- a/drivers/net/ethernet/xilinx/ll_temac.h
+++ b/drivers/net/ethernet/xilinx/ll_temac.h
@@ -375,6 +375,7 @@ struct temac_local {
 	int tx_bd_next;
 	int tx_bd_tail;
 	int rx_bd_ci;
+	int rx_bd_tail;
 
 	/* DMA channel control setup */
 	u32 tx_chnl_ctrl;
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index c368c3914bda..255207f2fd27 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -389,12 +389,13 @@ static int temac_dma_bd_init(struct net_device *ndev)
 	lp->tx_bd_next = 0;
 	lp->tx_bd_tail = 0;
 	lp->rx_bd_ci = 0;
+	lp->rx_bd_tail = RX_BD_NUM - 1;
 
 	/* Enable RX DMA transfers */
 	wmb();
 	lp->dma_out(lp, RX_CURDESC_PTR,  lp->rx_bd_p);
 	lp->dma_out(lp, RX_TAILDESC_PTR,
-		       lp->rx_bd_p + (sizeof(*lp->rx_bd_v) * (RX_BD_NUM - 1)));
+		       lp->rx_bd_p + (sizeof(*lp->rx_bd_v) * lp->rx_bd_tail));
 
 	/* Prepare for TX DMA transfer */
 	lp->dma_out(lp, TX_CURDESC_PTR, lp->tx_bd_p);
@@ -923,27 +924,41 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 static void ll_temac_recv(struct net_device *ndev)
 {
 	struct temac_local *lp = netdev_priv(ndev);
-	struct sk_buff *skb, *new_skb;
-	unsigned int bdstat;
-	struct cdmac_bd *cur_p;
-	dma_addr_t tail_p, skb_dma_addr;
-	int length;
 	unsigned long flags;
+	int rx_bd;
+	bool update_tail = false;
 
 	spin_lock_irqsave(&lp->rx_lock, flags);
 
-	tail_p = lp->rx_bd_p + sizeof(*lp->rx_bd_v) * lp->rx_bd_ci;
-	cur_p = &lp->rx_bd_v[lp->rx_bd_ci];
+	/* Process all received buffers, passing them on network
+	 * stack.  After this, the buffer descriptors will be in an
+	 * un-allocated stage, where no skb is allocated for it, and
+	 * they are therefore not available for TEMAC/DMA.
+	 */
+	do {
+		struct cdmac_bd *bd = &lp->rx_bd_v[lp->rx_bd_ci];
+		struct sk_buff *skb = lp->rx_skb[lp->rx_bd_ci];
+		unsigned int bdstat = be32_to_cpu(bd->app0);
+		int length;
 
-	bdstat = be32_to_cpu(cur_p->app0);
-	while ((bdstat & STS_CTRL_APP0_CMPLT)) {
+		/* While this should not normally happen, we can end
+		 * here when GFP_ATOMIC allocations fail, and we
+		 * therefore have un-allocated buffers.
+		 */
+		if (!skb)
+			break;
 
-		skb = lp->rx_skb[lp->rx_bd_ci];
-		length = be32_to_cpu(cur_p->app4) & 0x3FFF;
+		/* Loop over all completed buffer descriptors */
+		if (!(bdstat & STS_CTRL_APP0_CMPLT))
+			break;
 
-		dma_unmap_single(ndev->dev.parent, be32_to_cpu(cur_p->phys),
+		dma_unmap_single(ndev->dev.parent, be32_to_cpu(bd->phys),
 				 XTE_MAX_JUMBO_FRAME_SIZE, DMA_FROM_DEVICE);
+		/* The buffer is not valid for DMA anymore */
+		bd->phys = 0;
+		bd->len = 0;
 
+		length = be32_to_cpu(bd->app4) & 0x3FFF;
 		skb_put(skb, length);
 		skb->protocol = eth_type_trans(skb, ndev);
 		skb_checksum_none_assert(skb);
@@ -958,39 +973,74 @@ static void ll_temac_recv(struct net_device *ndev)
 			 * (back) for proper IP checksum byte order
 			 * (be16).
 			 */
-			skb->csum = htons(be32_to_cpu(cur_p->app3) & 0xFFFF);
+			skb->csum = htons(be32_to_cpu(bd->app3) & 0xFFFF);
 			skb->ip_summed = CHECKSUM_COMPLETE;
 		}
 
 		if (!skb_defer_rx_timestamp(skb))
 			netif_rx(skb);
+		/* The skb buffer is now owned by network stack above */
+		lp->rx_skb[lp->rx_bd_ci] = NULL;
 
 		ndev->stats.rx_packets++;
 		ndev->stats.rx_bytes += length;
 
-		new_skb = netdev_alloc_skb_ip_align(ndev,
-						XTE_MAX_JUMBO_FRAME_SIZE);
-		if (!new_skb) {
-			spin_unlock_irqrestore(&lp->rx_lock, flags);
-			return;
+		rx_bd = lp->rx_bd_ci;
+		if (++lp->rx_bd_ci >= RX_BD_NUM)
+			lp->rx_bd_ci = 0;
+	} while (rx_bd != lp->rx_bd_tail);
+
+	/* Allocate new buffers for those buffer descriptors that were
+	 * passed to network stack.  Note that GFP_ATOMIC allocations
+	 * can fail (e.g. when a larger burst of GFP_ATOMIC
+	 * allocations occurs), so while we try to allocate all
+	 * buffers in the same interrupt where they were processed, we
+	 * continue with what we could get in case of allocation
+	 * failure.  Allocation of remaining buffers will be retried
+	 * in following calls.
+	 */
+	while (1) {
+		struct sk_buff *skb;
+		struct cdmac_bd *bd;
+		dma_addr_t skb_dma_addr;
+
+		rx_bd = lp->rx_bd_tail + 1;
+		if (rx_bd >= RX_BD_NUM)
+			rx_bd = 0;
+		bd = &lp->rx_bd_v[rx_bd];
+
+		if (bd->phys)
+			break;	/* All skb's allocated */
+
+		skb = netdev_alloc_skb_ip_align(ndev, XTE_MAX_JUMBO_FRAME_SIZE);
+		if (!skb) {
+			dev_warn(&ndev->dev, "skb alloc failed\n");
+			break;
 		}
 
-		cur_p->app0 = cpu_to_be32(STS_CTRL_APP0_IRQONEND);
-		skb_dma_addr = dma_map_single(ndev->dev.parent, new_skb->data,
+		skb_dma_addr = dma_map_single(ndev->dev.parent, skb->data,
 					      XTE_MAX_JUMBO_FRAME_SIZE,
 					      DMA_FROM_DEVICE);
-		cur_p->phys = cpu_to_be32(skb_dma_addr);
-		cur_p->len = cpu_to_be32(XTE_MAX_JUMBO_FRAME_SIZE);
-		lp->rx_skb[lp->rx_bd_ci] = new_skb;
+		if (WARN_ON_ONCE(dma_mapping_error(ndev->dev.parent,
+						   skb_dma_addr))) {
+			dev_kfree_skb_any(skb);
+			break;
+		}
 
-		lp->rx_bd_ci++;
-		if (lp->rx_bd_ci >= RX_BD_NUM)
-			lp->rx_bd_ci = 0;
+		bd->phys = cpu_to_be32(skb_dma_addr);
+		bd->len = cpu_to_be32(XTE_MAX_JUMBO_FRAME_SIZE);
+		bd->app0 = cpu_to_be32(STS_CTRL_APP0_IRQONEND);
+		lp->rx_skb[rx_bd] = skb;
 
-		cur_p = &lp->rx_bd_v[lp->rx_bd_ci];
-		bdstat = be32_to_cpu(cur_p->app0);
+		lp->rx_bd_tail = rx_bd;
+		update_tail = true;
+	}
+
+	/* Move tail pointer when buffers have been allocated */
+	if (update_tail) {
+		lp->dma_out(lp, RX_TAILDESC_PTR,
+			lp->rx_bd_p + sizeof(*lp->rx_bd_v) * lp->rx_bd_tail);
 	}
-	lp->dma_out(lp, RX_TAILDESC_PTR, tail_p);
 
 	spin_unlock_irqrestore(&lp->rx_lock, flags);
 }

From 1d63b8d66d146deaaedbe16c80de105f685ea012 Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Fri, 21 Feb 2020 07:47:58 +0100
Subject: [PATCH 182/239] net: ll_temac: Handle DMA halt condition caused by
 buffer underrun

The SDMA engine used by TEMAC halts operation when it has finished
processing of the last buffer descriptor in the buffer ring.
Unfortunately, no interrupt event is generated when this happens,
so we need to setup another mechanism to make sure DMA operation is
restarted when enough buffers have been added to the ring.

Fixes: 92744989533c ("net: add Xilinx ll_temac device driver")
Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/xilinx/ll_temac.h      |  3 ++
 drivers/net/ethernet/xilinx/ll_temac_main.c | 58 +++++++++++++++++++--
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/ll_temac.h b/drivers/net/ethernet/xilinx/ll_temac.h
index 99fe059e5c7f..53fb8141f1a6 100644
--- a/drivers/net/ethernet/xilinx/ll_temac.h
+++ b/drivers/net/ethernet/xilinx/ll_temac.h
@@ -380,6 +380,9 @@ struct temac_local {
 	/* DMA channel control setup */
 	u32 tx_chnl_ctrl;
 	u32 rx_chnl_ctrl;
+	u8 coalesce_count_rx;
+
+	struct delayed_work restart_work;
 };
 
 /* Wrappers for temac_ior()/temac_iow() function pointers above */
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 255207f2fd27..9461acec6f70 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -51,6 +51,7 @@
 #include <linux/ip.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
+#include <linux/workqueue.h>
 #include <linux/dma-mapping.h>
 #include <linux/processor.h>
 #include <linux/platform_data/xilinx-ll-temac.h>
@@ -866,8 +867,11 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	skb_dma_addr = dma_map_single(ndev->dev.parent, skb->data,
 				      skb_headlen(skb), DMA_TO_DEVICE);
 	cur_p->len = cpu_to_be32(skb_headlen(skb));
-	if (WARN_ON_ONCE(dma_mapping_error(ndev->dev.parent, skb_dma_addr)))
-		return NETDEV_TX_BUSY;
+	if (WARN_ON_ONCE(dma_mapping_error(ndev->dev.parent, skb_dma_addr))) {
+		dev_kfree_skb_any(skb);
+		ndev->stats.tx_dropped++;
+		return NETDEV_TX_OK;
+	}
 	cur_p->phys = cpu_to_be32(skb_dma_addr);
 	ptr_to_txbd((void *)skb, cur_p);
 
@@ -897,7 +901,9 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 			dma_unmap_single(ndev->dev.parent,
 					 be32_to_cpu(cur_p->phys),
 					 skb_headlen(skb), DMA_TO_DEVICE);
-			return NETDEV_TX_BUSY;
+			dev_kfree_skb_any(skb);
+			ndev->stats.tx_dropped++;
+			return NETDEV_TX_OK;
 		}
 		cur_p->phys = cpu_to_be32(skb_dma_addr);
 		cur_p->len = cpu_to_be32(skb_frag_size(frag));
@@ -920,6 +926,17 @@ temac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	return NETDEV_TX_OK;
 }
 
+static int ll_temac_recv_buffers_available(struct temac_local *lp)
+{
+	int available;
+
+	if (!lp->rx_skb[lp->rx_bd_ci])
+		return 0;
+	available = 1 + lp->rx_bd_tail - lp->rx_bd_ci;
+	if (available <= 0)
+		available += RX_BD_NUM;
+	return available;
+}
 
 static void ll_temac_recv(struct net_device *ndev)
 {
@@ -990,6 +1007,18 @@ static void ll_temac_recv(struct net_device *ndev)
 			lp->rx_bd_ci = 0;
 	} while (rx_bd != lp->rx_bd_tail);
 
+	/* DMA operations will halt when the last buffer descriptor is
+	 * processed (ie. the one pointed to by RX_TAILDESC_PTR).
+	 * When that happens, no more interrupt events will be
+	 * generated.  No IRQ_COAL or IRQ_DLY, and not even an
+	 * IRQ_ERR.  To avoid stalling, we schedule a delayed work
+	 * when there is a potential risk of that happening.  The work
+	 * will call this function, and thus re-schedule itself until
+	 * enough buffers are available again.
+	 */
+	if (ll_temac_recv_buffers_available(lp) < lp->coalesce_count_rx)
+		schedule_delayed_work(&lp->restart_work, HZ / 1000);
+
 	/* Allocate new buffers for those buffer descriptors that were
 	 * passed to network stack.  Note that GFP_ATOMIC allocations
 	 * can fail (e.g. when a larger burst of GFP_ATOMIC
@@ -1045,6 +1074,18 @@ static void ll_temac_recv(struct net_device *ndev)
 	spin_unlock_irqrestore(&lp->rx_lock, flags);
 }
 
+/* Function scheduled to ensure a restart in case of DMA halt
+ * condition caused by running out of buffer descriptors.
+ */
+static void ll_temac_restart_work_func(struct work_struct *work)
+{
+	struct temac_local *lp = container_of(work, struct temac_local,
+					      restart_work.work);
+	struct net_device *ndev = lp->ndev;
+
+	ll_temac_recv(ndev);
+}
+
 static irqreturn_t ll_temac_tx_irq(int irq, void *_ndev)
 {
 	struct net_device *ndev = _ndev;
@@ -1137,6 +1178,8 @@ static int temac_stop(struct net_device *ndev)
 
 	dev_dbg(&ndev->dev, "temac_close()\n");
 
+	cancel_delayed_work_sync(&lp->restart_work);
+
 	free_irq(lp->tx_irq, ndev);
 	free_irq(lp->rx_irq, ndev);
 
@@ -1258,6 +1301,7 @@ static int temac_probe(struct platform_device *pdev)
 	lp->dev = &pdev->dev;
 	lp->options = XTE_OPTION_DEFAULTS;
 	spin_lock_init(&lp->rx_lock);
+	INIT_DELAYED_WORK(&lp->restart_work, ll_temac_restart_work_func);
 
 	/* Setup mutex for synchronization of indirect register access */
 	if (pdata) {
@@ -1364,6 +1408,7 @@ static int temac_probe(struct platform_device *pdev)
 		 */
 		lp->tx_chnl_ctrl = 0x10220000;
 		lp->rx_chnl_ctrl = 0xff070000;
+		lp->coalesce_count_rx = 0x07;
 
 		/* Finished with the DMA node; drop the reference */
 		of_node_put(dma_np);
@@ -1395,11 +1440,14 @@ static int temac_probe(struct platform_device *pdev)
 				(pdata->tx_irq_count << 16);
 		else
 			lp->tx_chnl_ctrl = 0x10220000;
-		if (pdata->rx_irq_timeout || pdata->rx_irq_count)
+		if (pdata->rx_irq_timeout || pdata->rx_irq_count) {
 			lp->rx_chnl_ctrl = (pdata->rx_irq_timeout << 24) |
 				(pdata->rx_irq_count << 16);
-		else
+			lp->coalesce_count_rx = pdata->rx_irq_count;
+		} else {
 			lp->rx_chnl_ctrl = 0xff070000;
+			lp->coalesce_count_rx = 0x07;
+		}
 	}
 
 	/* Error handle returned DMA RX and TX interrupts */

From 823d81b0fa2cd83a640734e74caee338b5d3c093 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 24 Feb 2020 18:46:22 +0200
Subject: [PATCH 183/239] net: bridge: fix stale eth hdr pointer in br_dev_xmit

In br_dev_xmit() we perform vlan filtering in br_allowed_ingress() but
if the packet has the vlan header inside (e.g. bridge with disabled
tx-vlan-offload) then the vlan filtering code will use skb_vlan_untag()
to extract the vid before filtering which in turn calls pskb_may_pull()
and we may end up with a stale eth pointer. Moreover the cached eth header
pointer will generally be wrong after that operation. Remove the eth header
caching and just use eth_hdr() directly, the compiler does the right thing
and calculates it only once so we don't lose anything.

Fixes: 057658cb33fb ("bridge: suppress arp pkts on BR_NEIGH_SUPPRESS ports")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_device.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index dc3d2c1dd9d5..0e3dbc5f3c34 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -34,7 +34,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	const struct nf_br_ops *nf_ops;
 	u8 state = BR_STATE_FORWARDING;
 	const unsigned char *dest;
-	struct ethhdr *eth;
 	u16 vid = 0;
 
 	rcu_read_lock();
@@ -54,15 +53,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
 
 	skb_reset_mac_header(skb);
-	eth = eth_hdr(skb);
 	skb_pull(skb, ETH_HLEN);
 
 	if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state))
 		goto out;
 
 	if (IS_ENABLED(CONFIG_INET) &&
-	    (eth->h_proto == htons(ETH_P_ARP) ||
-	     eth->h_proto == htons(ETH_P_RARP)) &&
+	    (eth_hdr(skb)->h_proto == htons(ETH_P_ARP) ||
+	     eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) &&
 	    br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
 		br_do_proxy_suppress_arp(skb, br, vid, NULL);
 	} else if (IS_ENABLED(CONFIG_IPV6) &&

From 2e90ca68b0d2f5548804f22f0dd61145516171e3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 21 Feb 2020 12:43:35 -0800
Subject: [PATCH 184/239] floppy: check FDC index for errors before assigning
 it

Jordy Zomer reported a KASAN out-of-bounds read in the floppy driver in
wait_til_ready().

Which on the face of it can't happen, since as Willy Tarreau points out,
the function does no particular memory access.  Except through the FDCS
macro, which just indexes a static allocation through teh current fdc,
which is always checked against N_FDC.

Except the checking happens after we've already assigned the value.

The floppy driver is a disgrace (a lot of it going back to my original
horrd "design"), and has no real maintainer.  Nobody has the hardware,
and nobody really cares.  But it still gets used in virtual environment
because it's one of those things that everybody supports.

The whole thing should be re-written, or at least parts of it should be
seriously cleaned up.  The 'current fdc' index, which is used by the
FDCS macro, and which is often shadowed by a local 'fdc' variable, is a
prime example of how not to write code.

But because nobody has the hardware or the motivation, let's just fix up
the immediate problem with a nasty band-aid: test the fdc index before
actually assigning it to the static 'fdc' variable.

Reported-by: Jordy Zomer <jordy@simplyhacker.com>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/floppy.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index cd3612e4e2e1..8ef65c085640 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -853,14 +853,17 @@ static void reset_fdc_info(int mode)
 /* selects the fdc and drive, and enables the fdc's input/dma. */
 static void set_fdc(int drive)
 {
+	unsigned int new_fdc = fdc;
+
 	if (drive >= 0 && drive < N_DRIVE) {
-		fdc = FDC(drive);
+		new_fdc = FDC(drive);
 		current_drive = drive;
 	}
-	if (fdc != 1 && fdc != 0) {
+	if (new_fdc >= N_FDC) {
 		pr_info("bad fdc value\n");
 		return;
 	}
+	fdc = new_fdc;
 	set_dor(fdc, ~0, 8);
 #if N_FDC > 1
 	set_dor(1 - fdc, ~8, 0);

From a0a31fd84f8f66828790d860545d4167777d58c6 Mon Sep 17 00:00:00 2001
From: Zong Li <zong.li@sifive.com>
Date: Fri, 7 Feb 2020 17:52:44 +0800
Subject: [PATCH 185/239] riscv: allocate a complete page size for each page
 table

Each page table should be created by allocating a complete page size
for it. Otherwise, the content of the page table would be corrupted
somewhere through memory allocation which allocates the memory at the
middle of the page table for other use.

Signed-off-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 arch/riscv/mm/kasan_init.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index f0cc86040587..f8eaf7e73a23 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -46,29 +46,34 @@ asmlinkage void __init kasan_early_init(void)
 
 static void __init populate(void *start, void *end)
 {
-	unsigned long i;
+	unsigned long i, offset;
 	unsigned long vaddr = (unsigned long)start & PAGE_MASK;
 	unsigned long vend = PAGE_ALIGN((unsigned long)end);
 	unsigned long n_pages = (vend - vaddr) / PAGE_SIZE;
+	unsigned long n_ptes =
+	    ((n_pages + PTRS_PER_PTE) & -PTRS_PER_PTE) / PTRS_PER_PTE;
 	unsigned long n_pmds =
-		(n_pages % PTRS_PER_PTE) ? n_pages / PTRS_PER_PTE + 1 :
-						n_pages / PTRS_PER_PTE;
+	    ((n_ptes + PTRS_PER_PMD) & -PTRS_PER_PMD) / PTRS_PER_PMD;
+
+	pte_t *pte =
+	    memblock_alloc(n_ptes * PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE);
+	pmd_t *pmd =
+	    memblock_alloc(n_pmds * PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
 	pgd_t *pgd = pgd_offset_k(vaddr);
-	pmd_t *pmd = memblock_alloc(n_pmds * sizeof(pmd_t), PAGE_SIZE);
-	pte_t *pte = memblock_alloc(n_pages * sizeof(pte_t), PAGE_SIZE);
 
 	for (i = 0; i < n_pages; i++) {
 		phys_addr_t phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
-
-		set_pte(pte + i, pfn_pte(PHYS_PFN(phys), PAGE_KERNEL));
+		set_pte(&pte[i], pfn_pte(PHYS_PFN(phys), PAGE_KERNEL));
 	}
 
-	for (i = 0; i < n_pmds; ++pgd, i += PTRS_PER_PMD)
-		set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(((uintptr_t)(pmd + i)))),
+	for (i = 0, offset = 0; i < n_ptes; i++, offset += PTRS_PER_PTE)
+		set_pmd(&pmd[i],
+			pfn_pmd(PFN_DOWN(__pa(&pte[offset])),
 				__pgprot(_PAGE_TABLE)));
 
-	for (i = 0; i < n_pages; ++pmd, i += PTRS_PER_PTE)
-		set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa((uintptr_t)(pte + i))),
+	for (i = 0, offset = 0; i < n_pmds; i++, offset += PTRS_PER_PMD)
+		set_pgd(&pgd[i],
+			pfn_pgd(PFN_DOWN(__pa(&pmd[offset])),
 				__pgprot(_PAGE_TABLE)));
 
 	flush_tlb_all();

From 8458ca147c204e7db124e8baa8fede219006e80d Mon Sep 17 00:00:00 2001
From: Zong Li <zong.li@sifive.com>
Date: Fri, 7 Feb 2020 17:52:45 +0800
Subject: [PATCH 186/239] riscv: adjust the indent

Adjust the indent to match Linux coding style.

Signed-off-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Palmer Dabbelt <palmerdabbelt@google.com>
---
 arch/riscv/mm/kasan_init.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index f8eaf7e73a23..ec0ca90dd900 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -19,18 +19,20 @@ asmlinkage void __init kasan_early_init(void)
 	for (i = 0; i < PTRS_PER_PTE; ++i)
 		set_pte(kasan_early_shadow_pte + i,
 			mk_pte(virt_to_page(kasan_early_shadow_page),
-			PAGE_KERNEL));
+			       PAGE_KERNEL));
 
 	for (i = 0; i < PTRS_PER_PMD; ++i)
 		set_pmd(kasan_early_shadow_pmd + i,
-		 pfn_pmd(PFN_DOWN(__pa((uintptr_t)kasan_early_shadow_pte)),
-			__pgprot(_PAGE_TABLE)));
+			pfn_pmd(PFN_DOWN
+				(__pa((uintptr_t) kasan_early_shadow_pte)),
+				__pgprot(_PAGE_TABLE)));
 
 	for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END;
 	     i += PGDIR_SIZE, ++pgd)
 		set_pgd(pgd,
-		 pfn_pgd(PFN_DOWN(__pa(((uintptr_t)kasan_early_shadow_pmd))),
-			__pgprot(_PAGE_TABLE)));
+			pfn_pgd(PFN_DOWN
+				(__pa(((uintptr_t) kasan_early_shadow_pmd))),
+				__pgprot(_PAGE_TABLE)));
 
 	/* init for swapper_pg_dir */
 	pgd = pgd_offset_k(KASAN_SHADOW_START);
@@ -38,8 +40,9 @@ asmlinkage void __init kasan_early_init(void)
 	for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END;
 	     i += PGDIR_SIZE, ++pgd)
 		set_pgd(pgd,
-		 pfn_pgd(PFN_DOWN(__pa(((uintptr_t)kasan_early_shadow_pmd))),
-			__pgprot(_PAGE_TABLE)));
+			pfn_pgd(PFN_DOWN
+				(__pa(((uintptr_t) kasan_early_shadow_pmd))),
+				__pgprot(_PAGE_TABLE)));
 
 	flush_tlb_all();
 }
@@ -86,7 +89,8 @@ void __init kasan_init(void)
 	unsigned long i;
 
 	kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
-			(void *)kasan_mem_to_shadow((void *)VMALLOC_END));
+				    (void *)kasan_mem_to_shadow((void *)
+								VMALLOC_END));
 
 	for_each_memblock(memory, reg) {
 		void *start = (void *)__va(reg->base);
@@ -95,14 +99,14 @@ void __init kasan_init(void)
 		if (start >= end)
 			break;
 
-		populate(kasan_mem_to_shadow(start),
-			 kasan_mem_to_shadow(end));
+		populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end));
 	};
 
 	for (i = 0; i < PTRS_PER_PTE; i++)
 		set_pte(&kasan_early_shadow_pte[i],
 			mk_pte(virt_to_page(kasan_early_shadow_page),
-			__pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_ACCESSED)));
+			       __pgprot(_PAGE_PRESENT | _PAGE_READ |
+					_PAGE_ACCESSED)));
 
 	memset(kasan_early_shadow_page, 0, PAGE_SIZE);
 	init_task.kasan_depth = 0;

From 756125289285f6e55a03861bf4b6257aa3d19a93 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 24 Feb 2020 16:38:57 -0500
Subject: [PATCH 187/239] audit: always check the netlink payload length in
 audit_receive_msg()

This patch ensures that we always check the netlink payload length
in audit_receive_msg() before we take any action on the payload
itself.

Cc: stable@vger.kernel.org
Reported-by: syzbot+399c44bf1f43b8747403@syzkaller.appspotmail.com
Reported-by: syzbot+e4b12d8d202701f08b6d@syzkaller.appspotmail.com
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/kernel/audit.c b/kernel/audit.c
index 17b0d523afb3..9ddfe2aa6671 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1101,13 +1101,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
 	audit_log_end(ab);
 }
 
-static int audit_set_feature(struct sk_buff *skb)
+static int audit_set_feature(struct audit_features *uaf)
 {
-	struct audit_features *uaf;
 	int i;
 
 	BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
-	uaf = nlmsg_data(nlmsg_hdr(skb));
 
 	/* if there is ever a version 2 we should handle that here */
 
@@ -1175,6 +1173,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	u32			seq;
 	void			*data;
+	int			data_len;
 	int			err;
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
@@ -1188,6 +1187,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	seq  = nlh->nlmsg_seq;
 	data = nlmsg_data(nlh);
+	data_len = nlmsg_len(nlh);
 
 	switch (msg_type) {
 	case AUDIT_GET: {
@@ -1211,7 +1211,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		struct audit_status	s;
 		memset(&s, 0, sizeof(s));
 		/* guard against past and future API changes */
-		memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+		memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
 		if (s.mask & AUDIT_STATUS_ENABLED) {
 			err = audit_set_enabled(s.enabled);
 			if (err < 0)
@@ -1315,7 +1315,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return err;
 		break;
 	case AUDIT_SET_FEATURE:
-		err = audit_set_feature(skb);
+		if (data_len < sizeof(struct audit_features))
+			return -EINVAL;
+		err = audit_set_feature(data);
 		if (err)
 			return err;
 		break;
@@ -1327,6 +1329,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 		err = audit_filter(msg_type, AUDIT_FILTER_USER);
 		if (err == 1) { /* match or error */
+			char *str = data;
+
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
 				err = tty_audit_push();
@@ -1334,26 +1338,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					break;
 			}
 			audit_log_user_recv_msg(&ab, msg_type);
-			if (msg_type != AUDIT_USER_TTY)
+			if (msg_type != AUDIT_USER_TTY) {
+				/* ensure NULL termination */
+				str[data_len - 1] = '\0';
 				audit_log_format(ab, " msg='%.*s'",
 						 AUDIT_MESSAGE_TEXT_MAX,
-						 (char *)data);
-			else {
-				int size;
-
+						 str);
+			} else {
 				audit_log_format(ab, " data=");
-				size = nlmsg_len(nlh);
-				if (size > 0 &&
-				    ((unsigned char *)data)[size - 1] == '\0')
-					size--;
-				audit_log_n_untrustedstring(ab, data, size);
+				if (data_len > 0 && str[data_len - 1] == '\0')
+					data_len--;
+				audit_log_n_untrustedstring(ab, str, data_len);
 			}
 			audit_log_end(ab);
 		}
 		break;
 	case AUDIT_ADD_RULE:
 	case AUDIT_DEL_RULE:
-		if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+		if (data_len < sizeof(struct audit_rule_data))
 			return -EINVAL;
 		if (audit_enabled == AUDIT_LOCKED) {
 			audit_log_common_recv_msg(audit_context(), &ab,
@@ -1365,7 +1367,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_log_end(ab);
 			return -EPERM;
 		}
-		err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
+		err = audit_rule_change(msg_type, seq, data, data_len);
 		break;
 	case AUDIT_LIST_RULES:
 		err = audit_list_rules_send(skb, seq);
@@ -1380,7 +1382,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_MAKE_EQUIV: {
 		void *bufp = data;
 		u32 sizes[2];
-		size_t msglen = nlmsg_len(nlh);
+		size_t msglen = data_len;
 		char *old, *new;
 
 		err = -EINVAL;
@@ -1456,7 +1458,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 		memset(&s, 0, sizeof(s));
 		/* guard against past and future API changes */
-		memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
+		memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
 		/* check if new data is valid */
 		if ((s.enabled != 0 && s.enabled != 1) ||
 		    (s.log_passwd != 0 && s.log_passwd != 1))

From 3234f4ed3066a58cd5ce8edcf752fa4fe0c95cb5 Mon Sep 17 00:00:00 2001
From: Paul Burton <paulburton@kernel.org>
Date: Sat, 22 Feb 2020 09:04:17 -0800
Subject: [PATCH 188/239] MAINTAINERS: Hand MIPS over to Thomas
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

My time with MIPS the company has reached its end, and so at best I'll
have little time spend on maintaining arch/mips/.

Ralf last authored a patch over 2 years ago, the last time he committed
one is even further back & activity was sporadic for a while before
that. The reality is that he isn't active.

Having a new maintainer with time to do things properly will be
beneficial all round. Thomas Bogendoerfer has been involved in MIPS
development for a long time & has offered to step up as maintainer, so
add Thomas and remove myself & Ralf from the MIPS entry.

Ralf already has an entry in CREDITS to honor his contributions, so this
just adds one for me.

Signed-off-by: Paul Burton <paulburton@kernel.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Acked-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-kernel@vger.kernel.org
Cc: linux-mips@vger.kernel.org
---
 CREDITS     | 5 +++++
 MAINTAINERS | 6 ++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/CREDITS b/CREDITS
index a97d3280a627..032b5994f476 100644
--- a/CREDITS
+++ b/CREDITS
@@ -567,6 +567,11 @@ D: Original author of Amiga FFS filesystem
 S: Orlando, Florida
 S: USA
 
+N: Paul Burton
+E: paulburton@kernel.org
+W: https://pburton.com
+D: MIPS maintainer 2018-2020
+
 N: Lennert Buytenhek
 E: kernel@wantstofly.org
 D: Original (2.4) rewrite of the ethernet bridging code
diff --git a/MAINTAINERS b/MAINTAINERS
index 38fe2f3f7b6f..2c546977fb88 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11114,14 +11114,12 @@ S:	Maintained
 F:	drivers/usb/image/microtek.*
 
 MIPS
-M:	Ralf Baechle <ralf@linux-mips.org>
-M:	Paul Burton <paulburton@kernel.org>
+M:	Thomas Bogendoerfer <tsbogend@alpha.franken.de>
 L:	linux-mips@vger.kernel.org
 W:	http://www.linux-mips.org/
-T:	git git://git.linux-mips.org/pub/scm/ralf/linux.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git
 Q:	http://patchwork.linux-mips.org/project/linux-mips/list/
-S:	Supported
+S:	Maintained
 F:	Documentation/devicetree/bindings/mips/
 F:	Documentation/mips/
 F:	arch/mips/

From 53ace1195263b30fd593677dd67559e879ed9aa2 Mon Sep 17 00:00:00 2001
From: Stephen Kitt <steve@sk2.org>
Date: Fri, 21 Feb 2020 21:57:33 +0100
Subject: [PATCH 189/239] docs: remove MPX from the x86 toc

MPX was removed in commit 45fc24e89b7c ("x86/mpx: remove MPX from
arch/x86"), this removes the corresponding entry in the x86 toc.

This was suggested by a Sphinx warning.

Signed-off-by: Stephen Kitt <steve@sk2.org>
Fixes: 45fc24e89b7cc ("x86/mpx: remove MPX from arch/x86")
Acked-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/x86/index.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index a8de2fbc1caa..265d9e9a093b 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -19,7 +19,6 @@ x86-specific Documentation
    tlb
    mtrr
    pat
-   intel_mpx
    intel-iommu
    intel_txt
    amd-memory-encryption

From adc10f5b0a03606e30c704cff1f0283a696d0260 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 21 Feb 2020 16:02:39 -0800
Subject: [PATCH 190/239] docs: Fix empty parallelism argument

When there was no parallelism (no top-level -j arg and a pre-1.7
sphinx-build), the argument passed would be empty ("") instead of just
being missing, which would (understandably) badly confuse sphinx-build.
Fix this by removing the quotes.

Reported-by: Rafael J. Wysocki <rafael@kernel.org>
Fixes: 51e46c7a4007 ("docs, parallelism: Rearrange how jobserver reservations are made")
Cc: stable@vger.kernel.org  # v5.5 only
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/sphinx/parallel-wrapper.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/sphinx/parallel-wrapper.sh b/Documentation/sphinx/parallel-wrapper.sh
index 7daf5133bdd3..e54c44ce117d 100644
--- a/Documentation/sphinx/parallel-wrapper.sh
+++ b/Documentation/sphinx/parallel-wrapper.sh
@@ -30,4 +30,4 @@ if [ -n "$parallel" ] ; then
 	parallel="-j$parallel"
 fi
 
-exec "$sphinx" "$parallel" "$@"
+exec "$sphinx" $parallel "$@"

From d0820556507bd7aef4f3a615b1b6eb66eb9785fe Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 21 Feb 2020 03:11:56 +0100
Subject: [PATCH 191/239] selftests: nft_concat_range: Move option for 'list
 ruleset' before command

Before nftables commit fb9cea50e8b3 ("main: enforce options before
commands"), 'nft list ruleset -a' happened to work, but it's wrong
and won't work anymore. Replace it by 'nft -a list ruleset'.

Reported-by: Chen Yi <yiche@redhat.com>
Fixes: 611973c1e06f ("selftests: netfilter: Introduce tests for sets with range concatenation")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../testing/selftests/netfilter/nft_concat_range.sh  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh
index aca21dde102a..5c1033ee1b39 100755
--- a/tools/testing/selftests/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/netfilter/nft_concat_range.sh
@@ -1025,7 +1025,7 @@ format_noconcat() {
 add() {
 	if ! nft add element inet filter test "${1}"; then
 		err "Failed to add ${1} given ruleset:"
-		err "$(nft list ruleset -a)"
+		err "$(nft -a list ruleset)"
 		return 1
 	fi
 }
@@ -1045,7 +1045,7 @@ add_perf() {
 add_perf_norange() {
 	if ! nft add element netdev perf norange "${1}"; then
 		err "Failed to add ${1} given ruleset:"
-		err "$(nft list ruleset -a)"
+		err "$(nft -a list ruleset)"
 		return 1
 	fi
 }
@@ -1054,7 +1054,7 @@ add_perf_norange() {
 add_perf_noconcat() {
 	if ! nft add element netdev perf noconcat "${1}"; then
 		err "Failed to add ${1} given ruleset:"
-		err "$(nft list ruleset -a)"
+		err "$(nft -a list ruleset)"
 		return 1
 	fi
 }
@@ -1063,7 +1063,7 @@ add_perf_noconcat() {
 del() {
 	if ! nft delete element inet filter test "${1}"; then
 		err "Failed to delete ${1} given ruleset:"
-		err "$(nft list ruleset -a)"
+		err "$(nft -a list ruleset)"
 		return 1
 	fi
 }
@@ -1134,7 +1134,7 @@ send_match() {
 		err "  $(for f in ${src}; do
 			 eval format_\$f "${2}"; printf ' '; done)"
 		err "should have matched ruleset:"
-		err "$(nft list ruleset -a)"
+		err "$(nft -a list ruleset)"
 		return 1
 	fi
 	nft reset counter inet filter test >/dev/null
@@ -1160,7 +1160,7 @@ send_nomatch() {
 		err "  $(for f in ${src}; do
 			 eval format_\$f "${2}"; printf ' '; done)"
 		err "should not have matched ruleset:"
-		err "$(nft list ruleset -a)"
+		err "$(nft -a list ruleset)"
 		return 1
 	fi
 }

From a8e41f6033a0c5633d55d6e35993c9e2005d872f Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 25 Feb 2020 18:05:35 +0800
Subject: [PATCH 192/239] icmp: allow icmpv6_ndo_send to work with
 CONFIG_IPV6=n

The icmpv6_send function has long had a static inline implementation
with an empty body for CONFIG_IPV6=n, so that code calling it doesn't
need to be ifdef'd. The new icmpv6_ndo_send function, which is intended
for drivers as a drop-in replacement with an identical function
signature, should follow the same pattern. Without this patch, drivers
that used to work with CONFIG_IPV6=n now result in a linker error.

Cc: Chen Zhou <chenzhou10@huawei.com>
Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: 0b41713b6066 ("icmp: introduce helper for nat'd source address in network device context")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/icmpv6.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h
index 93338fd54af8..33d379602314 100644
--- a/include/linux/icmpv6.h
+++ b/include/linux/icmpv6.h
@@ -22,19 +22,23 @@ extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn);
 int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
 			       unsigned int data_len);
 
+#if IS_ENABLED(CONFIG_NF_NAT)
+void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
+#else
+#define icmpv6_ndo_send icmpv6_send
+#endif
+
 #else
 
 static inline void icmpv6_send(struct sk_buff *skb,
 			       u8 type, u8 code, __u32 info)
 {
-
 }
-#endif
 
-#if IS_ENABLED(CONFIG_NF_NAT)
-void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info);
-#else
-#define icmpv6_ndo_send icmpv6_send
+static inline void icmpv6_ndo_send(struct sk_buff *skb,
+				   u8 type, u8 code, __u32 info)
+{
+}
 #endif
 
 extern int				icmpv6_init(void);

From 2910b5aa6f545c044173a5cab3dbb7f43e23916d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 25 Feb 2020 23:36:41 +0900
Subject: [PATCH 193/239] bootconfig: Fix CONFIG_BOOTTIME_TRACING dependency
 issue

Since commit d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by
default") also changed the CONFIG_BOOTTIME_TRACING to select
CONFIG_BOOT_CONFIG to show the boot-time tracing on the menu,
it introduced wrong dependencies with BLK_DEV_INITRD as below.

WARNING: unmet direct dependencies detected for BOOT_CONFIG
  Depends on [n]: BLK_DEV_INITRD [=n]
  Selected by [y]:
  - BOOTTIME_TRACING [=y] && TRACING_SUPPORT [=y] && FTRACE [=y] && TRACING [=y]

This makes the CONFIG_BOOT_CONFIG selects CONFIG_BLK_DEV_INITRD to
fix this error and make CONFIG_BOOTTIME_TRACING=n by default, so
that both boot-time tracing and boot configuration off but those
appear on the menu list.

Link: http://lkml.kernel.org/r/158264140162.23842.11237423518607465535.stgit@devnote2

Fixes: d8a953ddde5e ("bootconfig: Set CONFIG_BOOT_CONFIG=n by default")
Reported-by: Randy Dunlap <rdunlap@infradead.org>
Compiled-tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 init/Kconfig         | 2 +-
 kernel/trace/Kconfig | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index a84e7aa89a29..8b4c3e8c05ea 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1217,7 +1217,7 @@ endif
 
 config BOOT_CONFIG
 	bool "Boot config support"
-	depends on BLK_DEV_INITRD
+	select BLK_DEV_INITRD
 	help
 	  Extra boot config allows system admin to pass a config file as
 	  complemental extension of kernel cmdline when booting.
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 795c3e02d3f1..402eef84c859 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -145,7 +145,6 @@ config BOOTTIME_TRACING
 	bool "Boot-time Tracing support"
 	depends on TRACING
 	select BOOT_CONFIG
-	default y
 	help
 	  Enable developer to setup ftrace subsystem via supplemental
 	  kernel cmdline at boot time for debugging (tracing) driver

From 212d58c106fd0f2704664be2bb173e14cb4e86d3 Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 21 Feb 2020 03:04:21 +0100
Subject: [PATCH 194/239] nft_set_pipapo: Actually fetch key data in
 nft_pipapo_remove()

Phil reports that adding elements, flushing and re-adding them
right away:

  nft add table t '{ set s { type ipv4_addr . inet_service; flags interval; }; }'
  nft add element t s '{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }'
  nft flush set t s
  nft add element t s '{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }'

triggers, almost reliably, a crash like this one:

  [   71.319848] general protection fault, probably for non-canonical address 0x6f6b6e696c2e756e: 0000 [#1] PREEMPT SMP PTI
  [   71.321540] CPU: 3 PID: 1201 Comm: kworker/3:2 Not tainted 5.6.0-rc1-00377-g2bb07f4e1d861 #192
  [   71.322746] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014
  [   71.324430] Workqueue: events nf_tables_trans_destroy_work [nf_tables]
  [   71.325387] RIP: 0010:nft_set_elem_destroy+0xa5/0x110 [nf_tables]
  [   71.326164] Code: 89 d4 84 c0 74 0e 8b 77 44 0f b6 f8 48 01 df e8 41 ff ff ff 45 84 e4 74 36 44 0f b6 63 08 45 84 e4 74 2c 49 01 dc 49 8b 04 24 <48> 8b 40 38 48 85 c0 74 4f 48 89 e7 4c 8b
  [   71.328423] RSP: 0018:ffffc9000226fd90 EFLAGS: 00010282
  [   71.329225] RAX: 6f6b6e696c2e756e RBX: ffff88813ab79f60 RCX: ffff88813931b5a0
  [   71.330365] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff88813ab79f9a
  [   71.331473] RBP: ffff88813ab79f60 R08: 0000000000000008 R09: 0000000000000000
  [   71.332627] R10: 000000000000021c R11: 0000000000000000 R12: ffff88813ab79fc2
  [   71.333615] R13: ffff88813b3adf50 R14: dead000000000100 R15: ffff88813931b8a0
  [   71.334596] FS:  0000000000000000(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000
  [   71.335780] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [   71.336577] CR2: 000055ac683710f0 CR3: 000000013a222003 CR4: 0000000000360ee0
  [   71.337533] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [   71.338557] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [   71.339718] Call Trace:
  [   71.340093]  nft_pipapo_destroy+0x7a/0x170 [nf_tables_set]
  [   71.340973]  nft_set_destroy+0x20/0x50 [nf_tables]
  [   71.341879]  nf_tables_trans_destroy_work+0x246/0x260 [nf_tables]
  [   71.342916]  process_one_work+0x1d5/0x3c0
  [   71.343601]  worker_thread+0x4a/0x3c0
  [   71.344229]  kthread+0xfb/0x130
  [   71.344780]  ? process_one_work+0x3c0/0x3c0
  [   71.345477]  ? kthread_park+0x90/0x90
  [   71.346129]  ret_from_fork+0x35/0x40
  [   71.346748] Modules linked in: nf_tables_set nf_tables nfnetlink 8021q [last unloaded: nfnetlink]
  [   71.348153] ---[ end trace 2eaa8149ca759bcc ]---
  [   71.349066] RIP: 0010:nft_set_elem_destroy+0xa5/0x110 [nf_tables]
  [   71.350016] Code: 89 d4 84 c0 74 0e 8b 77 44 0f b6 f8 48 01 df e8 41 ff ff ff 45 84 e4 74 36 44 0f b6 63 08 45 84 e4 74 2c 49 01 dc 49 8b 04 24 <48> 8b 40 38 48 85 c0 74 4f 48 89 e7 4c 8b
  [   71.350017] RSP: 0018:ffffc9000226fd90 EFLAGS: 00010282
  [   71.350019] RAX: 6f6b6e696c2e756e RBX: ffff88813ab79f60 RCX: ffff88813931b5a0
  [   71.350019] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff88813ab79f9a
  [   71.350020] RBP: ffff88813ab79f60 R08: 0000000000000008 R09: 0000000000000000
  [   71.350021] R10: 000000000000021c R11: 0000000000000000 R12: ffff88813ab79fc2
  [   71.350022] R13: ffff88813b3adf50 R14: dead000000000100 R15: ffff88813931b8a0
  [   71.350025] FS:  0000000000000000(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000
  [   71.350026] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [   71.350027] CR2: 000055ac683710f0 CR3: 000000013a222003 CR4: 0000000000360ee0
  [   71.350028] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [   71.350028] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [   71.350030] Kernel panic - not syncing: Fatal exception
  [   71.350412] Kernel Offset: disabled
  [   71.365922] ---[ end Kernel panic - not syncing: Fatal exception ]---

which is caused by dangling elements that have been deactivated, but
never removed.

On a flush operation, nft_pipapo_walk() walks through all the elements
in the mapping table, which are then deactivated by nft_flush_set(),
one by one, and added to the commit list for removal. Element data is
then freed.

On transaction commit, nft_pipapo_remove() is called, and failed to
remove these elements, leading to the stale references in the mapping.
The first symptom of this, revealed by KASan, is a one-byte
use-after-free in subsequent calls to nft_pipapo_walk(), which is
usually not enough to trigger a panic. When stale elements are used
more heavily, though, such as double-free via nft_pipapo_destroy()
as in Phil's case, the problem becomes more noticeable.

The issue comes from that fact that, on a flush operation,
nft_pipapo_remove() won't get the actual key data via elem->key,
elements to be deleted upon commit won't be found by the lookup via
pipapo_get(), and removal will be skipped. Key data should be fetched
via nft_set_ext_key(), instead.

Reported-by: Phil Sutter <phil@nwl.cc>
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_pipapo.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index feac8553f6d9..4fc0c924ed5d 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1766,11 +1766,13 @@ static bool pipapo_match_field(struct nft_pipapo_field *f,
 static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
 			      const struct nft_set_elem *elem)
 {
-	const u8 *data = (const u8 *)elem->key.val.data;
 	struct nft_pipapo *priv = nft_set_priv(set);
 	struct nft_pipapo_match *m = priv->clone;
+	struct nft_pipapo_elem *e = elem->priv;
 	int rules_f0, first_rule = 0;
-	struct nft_pipapo_elem *e;
+	const u8 *data;
+
+	data = (const u8 *)nft_set_ext_key(&e->ext);
 
 	e = pipapo_get(net, set, data, 0);
 	if (IS_ERR(e))

From 0954df70fba743d8cdaa09ccf6ba8e4ad09628de Mon Sep 17 00:00:00 2001
From: Stefano Brivio <sbrivio@redhat.com>
Date: Fri, 21 Feb 2020 03:04:22 +0100
Subject: [PATCH 195/239] selftests: nft_concat_range: Add test for reported
 add/flush/add issue

Add a specific test for the crash reported by Phil Sutter and addressed
in the previous patch. The test cases that, in my intention, should
have covered these cases, that is, the ones from the 'concurrency'
section, don't run these sequences tightly enough and spectacularly
failed to catch this.

While at it, define a convenient way to add these kind of tests, by
adding a "reported issues" test section.

It's more convenient, for this particular test, to execute the set
setup in its own function. However, future test cases like this one
might need to call setup functions, and will typically need no tools
other than nft, so allow for this in check_tools().

The original form of the reproducer used here was provided by Phil.

Reported-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../selftests/netfilter/nft_concat_range.sh   | 43 +++++++++++++++++--
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh
index 5c1033ee1b39..5a4938d6dcf2 100755
--- a/tools/testing/selftests/netfilter/nft_concat_range.sh
+++ b/tools/testing/selftests/netfilter/nft_concat_range.sh
@@ -13,11 +13,12 @@
 KSELFTEST_SKIP=4
 
 # Available test groups:
+# - reported_issues: check for issues that were reported in the past
 # - correctness: check that packets match given entries, and only those
 # - concurrency: attempt races between insertion, deletion and lookup
 # - timeout: check that packets match entries until they expire
 # - performance: estimate matching rate, compare with rbtree and hash baselines
-TESTS="correctness concurrency timeout"
+TESTS="reported_issues correctness concurrency timeout"
 [ "${quicktest}" != "1" ] && TESTS="${TESTS} performance"
 
 # Set types, defined by TYPE_ variables below
@@ -25,6 +26,9 @@ TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto
        net_port_net net_mac net_mac_icmp net6_mac_icmp net6_port_net6_port
        net_port_mac_proto_net"
 
+# Reported bugs, also described by TYPE_ variables below
+BUGS="flush_remove_add"
+
 # List of possible paths to pktgen script from kernel tree for performance tests
 PKTGEN_SCRIPT_PATHS="
 	../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
@@ -327,6 +331,12 @@ flood_spec	ip daddr . tcp dport . meta l4proto . ip saddr
 perf_duration	0
 "
 
+# Definition of tests for bugs reported in the past:
+# display	display text for test report
+TYPE_flush_remove_add="
+display		Add two elements, flush, re-add
+"
+
 # Set template for all tests, types and rules are filled in depending on test
 set_template='
 flush ruleset
@@ -440,6 +450,8 @@ setup_set() {
 
 # Check that at least one of the needed tools is available
 check_tools() {
+	[ -z "${tools}" ] && return 0
+
 	__tools=
 	for tool in ${tools}; do
 		if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \
@@ -1430,6 +1442,23 @@ test_performance() {
 	kill "${perf_pid}"
 }
 
+test_bug_flush_remove_add() {
+	set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }'
+	elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }'
+	elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }'
+	for i in `seq 1 100`; do
+		nft add table t ${set_cmd}	|| return ${KSELFTEST_SKIP}
+		nft add element t s ${elem1}	2>/dev/null || return 1
+		nft flush set t s		2>/dev/null || return 1
+		nft add element t s ${elem2}	2>/dev/null || return 1
+	done
+	nft flush ruleset
+}
+
+test_reported_issues() {
+	eval test_bug_"${subtest}"
+}
+
 # Run everything in a separate network namespace
 [ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; }
 tmp="$(mktemp)"
@@ -1438,9 +1467,15 @@ trap cleanup EXIT
 # Entry point for test runs
 passed=0
 for name in ${TESTS}; do
-	printf "TEST: %s\n" "${name}"
-	for type in ${TYPES}; do
-		eval desc=\$TYPE_"${type}"
+	printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')"
+	if [ "${name}" = "reported_issues" ]; then
+		SUBTESTS="${BUGS}"
+	else
+		SUBTESTS="${TYPES}"
+	fi
+
+	for subtest in ${SUBTESTS}; do
+		eval desc=\$TYPE_"${subtest}"
 		IFS='
 '
 		for __line in ${desc}; do

From 505b12b3861bc79d1b81c815faaf4910469a7006 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 12 Feb 2020 20:40:57 -0800
Subject: [PATCH 196/239] kbuild: add comment for V=2 mode

Complete the comments for valid values of KBUILD_VERBOSE,
specifically for KBUILD_VERBOSE=2.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 0914049d2929..2afa692b39ff 100644
--- a/Makefile
+++ b/Makefile
@@ -68,6 +68,7 @@ unexport GREP_OPTIONS
 #
 # If KBUILD_VERBOSE equals 0 then the above command will be hidden.
 # If KBUILD_VERBOSE equals 1 then the above command is displayed.
+# If KBUILD_VERBOSE equals 2 then give the reason why each target is rebuilt.
 #
 # To put more focus on warnings, be less verbose as default
 # Use 'make V=1' to see the full commands

From eccbde4f6c2b6ebc52b3e9103e6f2f73f5a9f79a Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Wed, 19 Feb 2020 10:15:19 +0900
Subject: [PATCH 197/239] kbuild: remove wrong documentation about mandatory-y

This sentence does not make sense in the section about mandatory-y.

This seems to be a copy-paste mistake of commit fcc8487d477a ("uapi:
export all headers under uapi directories").

The correct description would be "The convention is to list one
mandatory-y per line ...".

I just removed it instead of fixing it. If such information is needed,
it could be commented in include/asm-generic/Kbuild and
include/uapi/asm-generic/Kbuild.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Documentation/kbuild/makefiles.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst
index 0e0eb2c8da7d..4018ad7c7a11 100644
--- a/Documentation/kbuild/makefiles.rst
+++ b/Documentation/kbuild/makefiles.rst
@@ -1379,9 +1379,6 @@ See subsequent chapter for the syntax of the Kbuild file.
 	in arch/$(ARCH)/include/(uapi/)/asm, Kbuild will automatically generate
 	a wrapper of the asm-generic one.
 
-	The convention is to list one subdir per line and
-	preferably in alphabetic order.
-
 8 Kbuild Variables
 ==================
 

From 7a04960560640ac5b0b89461f7757322b57d0c7a Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 23 Feb 2020 04:04:31 +0900
Subject: [PATCH 198/239] kbuild: fix DT binding schema rule to detect command
 line changes

This if_change_rule is not working properly; it cannot detect any
command line change.

The reason is because cmd-check in scripts/Kbuild.include compares
$(cmd_$@) and $(cmd_$1), but cmd_dtc_dt_yaml does not exist here.

For if_change_rule to work properly, the stem part of cmd_* and rule_*
must match. Because this cmd_and_fixdep invokes cmd_dtc, this rule must
be named rule_dtc.

Fixes: 4f0e3a57d6eb ("kbuild: Add support for DT binding schema checks")
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Rob Herring <robh@kernel.org>
---
 scripts/Makefile.lib | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index bae62549e3d2..64b938c10039 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -302,13 +302,13 @@ DT_TMP_SCHEMA := $(objtree)/$(DT_BINDING_DIR)/processed-schema.yaml
 quiet_cmd_dtb_check =	CHECK   $@
       cmd_dtb_check =	$(DT_CHECKER) -u $(srctree)/$(DT_BINDING_DIR) -p $(DT_TMP_SCHEMA) $@ ;
 
-define rule_dtc_dt_yaml
+define rule_dtc
 	$(call cmd_and_fixdep,dtc,yaml)
 	$(call cmd,dtb_check)
 endef
 
 $(obj)/%.dt.yaml: $(src)/%.dts $(DTC) $(DT_TMP_SCHEMA) FORCE
-	$(call if_changed_rule,dtc_dt_yaml)
+	$(call if_changed_rule,dtc)
 
 dtc-tmp = $(subst $(comma),_,$(dot-target).dts.tmp)
 

From fd63fab48f143f73b534821408a303241ed174f9 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 23 Feb 2020 04:04:32 +0900
Subject: [PATCH 199/239] kbuild: remove unneeded semicolon at the end of
 cmd_dtb_check

This trailing semicolon is unneeded.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Rob Herring <robh@kernel.org>
---
 scripts/Makefile.lib | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 64b938c10039..752ff0a225a9 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -300,7 +300,7 @@ DT_BINDING_DIR := Documentation/devicetree/bindings
 DT_TMP_SCHEMA := $(objtree)/$(DT_BINDING_DIR)/processed-schema.yaml
 
 quiet_cmd_dtb_check =	CHECK   $@
-      cmd_dtb_check =	$(DT_CHECKER) -u $(srctree)/$(DT_BINDING_DIR) -p $(DT_TMP_SCHEMA) $@ ;
+      cmd_dtb_check =	$(DT_CHECKER) -u $(srctree)/$(DT_BINDING_DIR) -p $(DT_TMP_SCHEMA) $@
 
 define rule_dtc
 	$(call cmd_and_fixdep,dtc,yaml)

From 964a596db8db8c77c9903dd05655696696e6b3ad Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 23 Feb 2020 04:04:33 +0900
Subject: [PATCH 200/239] kbuild: add dtbs_check to PHONY

The dtbs_check should be a phony target, but currently it is not
specified so.

'make dtbs_check' works even if a file named 'dtbs_check' exists
because it depends on another phony target, scripts_dtc, but we
should not rely on it.

Add dtbs_check to PHONY.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Rob Herring <robh@kernel.org>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 2afa692b39ff..51df13599211 100644
--- a/Makefile
+++ b/Makefile
@@ -1239,7 +1239,7 @@ ifneq ($(dtstree),)
 %.dtb: include/config/kernel.release scripts_dtc
 	$(Q)$(MAKE) $(build)=$(dtstree) $(dtstree)/$@
 
-PHONY += dtbs dtbs_install dt_binding_check
+PHONY += dtbs dtbs_install dtbs_check dt_binding_check
 dtbs dtbs_check: include/config/kernel.release scripts_dtc
 	$(Q)$(MAKE) $(build)=$(dtstree)
 

From c473a8d03ea8e03ca9d649b0b6d72fbcf6212c05 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sun, 23 Feb 2020 04:04:34 +0900
Subject: [PATCH 201/239] kbuild: add dt_binding_check to PHONY in a correct
 place

The dt_binding_check is added to PHONY, but it is invisible when
$(dtstree) is empty. So, it is not specified as phony for
ARCH=x86 etc.

Add it to PHONY outside the ifneq ... endif block.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Rob Herring <robh@kernel.org>
---
 Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 51df13599211..1a1a0d271697 100644
--- a/Makefile
+++ b/Makefile
@@ -1239,7 +1239,7 @@ ifneq ($(dtstree),)
 %.dtb: include/config/kernel.release scripts_dtc
 	$(Q)$(MAKE) $(build)=$(dtstree) $(dtstree)/$@
 
-PHONY += dtbs dtbs_install dtbs_check dt_binding_check
+PHONY += dtbs dtbs_install dtbs_check
 dtbs dtbs_check: include/config/kernel.release scripts_dtc
 	$(Q)$(MAKE) $(build)=$(dtstree)
 
@@ -1259,6 +1259,7 @@ PHONY += scripts_dtc
 scripts_dtc: scripts_basic
 	$(Q)$(MAKE) $(build)=scripts/dtc
 
+PHONY += dt_binding_check
 dt_binding_check: scripts_dtc
 	$(Q)$(MAKE) $(build)=Documentation/devicetree/bindings
 

From fda31c50292a5062332fa0343c084bd9f46604d9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 24 Feb 2020 12:47:14 -0800
Subject: [PATCH 202/239] signal: avoid double atomic counter increments for
 user accounting

When queueing a signal, we increment both the users count of pending
signals (for RLIMIT_SIGPENDING tracking) and we increment the refcount
of the user struct itself (because we keep a reference to the user in
the signal structure in order to correctly account for it when freeing).

That turns out to be fairly expensive, because both of them are atomic
updates, and particularly under extreme signal handling pressure on big
machines, you can get a lot of cache contention on the user struct.
That can then cause horrid cacheline ping-pong when you do these
multiple accesses.

So change the reference counting to only pin the user for the _first_
pending signal, and to unpin it when the last pending signal is
dequeued.  That means that when a user sees a lot of concurrent signal
queuing - which is the only situation when this matters - the only
atomic access needed is generally the 'sigpending' count update.

This was noticed because of a particularly odd timing artifact on a
dual-socket 96C/192T Cascade Lake platform: when you get into bad
contention, on that machine for some reason seems to be much worse when
the contention happens in the upper 32-byte half of the cacheline.

As a result, the kernel test robot will-it-scale 'signal1' benchmark had
an odd performance regression simply due to random alignment of the
'struct user_struct' (and pointed to a completely unrelated and
apparently nonsensical commit for the regression).

Avoiding the double increments (and decrements on the dequeueing side,
of course) makes for much less contention and hugely improved
performance on that will-it-scale microbenchmark.

Quoting Feng Tang:

 "It makes a big difference, that the performance score is tripled! bump
  from original 17000 to 54000. Also the gap between 5.0-rc6 and
  5.0-rc6+Jiri's patch is reduced to around 2%"

[ The "2% gap" is the odd cacheline placement difference on that
  platform: under the extreme contention case, the effect of which half
  of the cacheline was hot was 5%, so with the reduced contention the
  odd timing artifact is reduced too ]

It does help in the non-contended case too, but is not nearly as
noticeable.

Reported-and-tested-by: Feng Tang <feng.tang@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Philip Li <philip.li@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 9ad8dea93dbb..5b2396350dd1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 {
 	struct sigqueue *q = NULL;
 	struct user_struct *user;
+	int sigpending;
 
 	/*
 	 * Protect access to @t credentials. This can go away when all
 	 * callers hold rcu read lock.
+	 *
+	 * NOTE! A pending signal will hold on to the user refcount,
+	 * and we get/put the refcount only when the sigpending count
+	 * changes from/to zero.
 	 */
 	rcu_read_lock();
-	user = get_uid(__task_cred(t)->user);
-	atomic_inc(&user->sigpending);
+	user = __task_cred(t)->user;
+	sigpending = atomic_inc_return(&user->sigpending);
+	if (sigpending == 1)
+		get_uid(user);
 	rcu_read_unlock();
 
-	if (override_rlimit ||
-	    atomic_read(&user->sigpending) <=
-			task_rlimit(t, RLIMIT_SIGPENDING)) {
+	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	} else {
 		print_dropped_signal(sig);
 	}
 
 	if (unlikely(q == NULL)) {
-		atomic_dec(&user->sigpending);
-		free_uid(user);
+		if (atomic_dec_and_test(&user->sigpending))
+			free_uid(user);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
@@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
 		return;
-	atomic_dec(&q->user->sigpending);
-	free_uid(q->user);
+	if (atomic_dec_and_test(&q->user->sigpending))
+		free_uid(q->user);
 	kmem_cache_free(sigqueue_cachep, q);
 }
 

From cfe2ce49b9da3959015e94b08f7494ade3ee0c49 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 26 Feb 2020 07:39:29 -0800
Subject: [PATCH 203/239] Revert "KVM: x86: enable -Werror"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit ead68df94d248c80fdbae220ae5425eb5af2e753.

Using the -Werror flag breaks the build for me due to mostly harmless
KASAN or similar warnings:

  arch/x86/kvm/x86.c: In function ‘kvm_timer_init’:
  arch/x86/kvm/x86.c:7209:1: error: the frame size of 1112 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]

Feel free to add a CONFIG_WERROR if you care strong enough, but don't
break peoples builds for absolutely no good reason.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kvm/Makefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4654e97a05cc..b19ef421084d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ccflags-y += -Iarch/x86/kvm
-ccflags-y += -Werror
 
 KVM := ../../../virt/kvm
 

From 6e11d1578fba8d09d03a286740ffcf336d53928c Mon Sep 17 00:00:00 2001
From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Mon, 24 Feb 2020 10:56:00 -0800
Subject: [PATCH 204/239] net: Fix Tx hash bound checking

Fixes the lower and upper bounds when there are multiple TCs and
traffic is on the the same TC on the same device.

The lower bound is represented by 'qoffset' and the upper limit for
hash value is 'qcount + qoffset'. This gives a clean Rx to Tx queue
mapping when there are multiple TCs, as the queue indices for upper TCs
will be offset by 'qoffset'.

v2: Fixed commit description based on comments.

Fixes: 1b837d489e06 ("net: Revoke export for __skb_tx_hash, update it to just be static skb_tx_hash")
Fixes: eadec877ce9c ("net: Add support for subordinate traffic classes to netdev_pick_tx")
Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
Reviewed-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index e10bd680dc03..c6c985fe7b1b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3076,6 +3076,8 @@ static u16 skb_tx_hash(const struct net_device *dev,
 
 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
+		if (hash >= qoffset)
+			hash -= qoffset;
 		while (unlikely(hash >= qcount))
 			hash -= qcount;
 		return hash + qoffset;

From e34f1753eebc428c312527662eb1b529cf260240 Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Mon, 24 Feb 2020 20:42:12 +0100
Subject: [PATCH 205/239] ethtool: limit bitset size

Syzbot reported that ethnl_compact_sanity_checks() can be tricked into
reading past the end of ETHTOOL_A_BITSET_VALUE and ETHTOOL_A_BITSET_MASK
attributes and even the message by passing a value between (u32)(-31)
and (u32)(-1) as ETHTOOL_A_BITSET_SIZE.

The problem is that DIV_ROUND_UP(attr_nbits, 32) is 0 for such values so
that zero length ETHTOOL_A_BITSET_VALUE will pass the length check but
ethnl_bitmap32_not_zero() check would try to access up to 512 MB of
attribute "payload".

Prevent this overflow byt limiting the bitset size. Technically, compact
bitset format would allow bitset sizes up to almost 2^18 (so that the
nest size does not exceed U16_MAX) but bitsets used by ethtool are much
shorter. S16_MAX, the largest value which can be directly used as an
upper limit in policy, should be a reasonable compromise.

Fixes: 10b518d4e6dd ("ethtool: netlink bitset handling")
Reported-by: syzbot+7fd4ed5b4234ab1fdccd@syzkaller.appspotmail.com
Reported-by: syzbot+709b7a64d57978247e44@syzkaller.appspotmail.com
Reported-by: syzbot+983cb8fb2d17a7af549d@syzkaller.appspotmail.com
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ethtool/bitset.c | 3 ++-
 net/ethtool/bitset.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
index 8977fe1f3946..ef9197541cb3 100644
--- a/net/ethtool/bitset.c
+++ b/net/ethtool/bitset.c
@@ -305,7 +305,8 @@ int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val,
 static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = {
 	[ETHTOOL_A_BITSET_UNSPEC]	= { .type = NLA_REJECT },
 	[ETHTOOL_A_BITSET_NOMASK]	= { .type = NLA_FLAG },
-	[ETHTOOL_A_BITSET_SIZE]		= { .type = NLA_U32 },
+	[ETHTOOL_A_BITSET_SIZE]		= NLA_POLICY_MAX(NLA_U32,
+							 ETHNL_MAX_BITSET_SIZE),
 	[ETHTOOL_A_BITSET_BITS]		= { .type = NLA_NESTED },
 	[ETHTOOL_A_BITSET_VALUE]	= { .type = NLA_BINARY },
 	[ETHTOOL_A_BITSET_MASK]		= { .type = NLA_BINARY },
diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h
index b8247e34109d..b849f9d19676 100644
--- a/net/ethtool/bitset.h
+++ b/net/ethtool/bitset.h
@@ -3,6 +3,8 @@
 #ifndef _NET_ETHTOOL_BITSET_H
 #define _NET_ETHTOOL_BITSET_H
 
+#define ETHNL_MAX_BITSET_SIZE S16_MAX
+
 typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN];
 
 int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact);

From 99b79c3900d4627672c85d9f344b5b0f06bc2a4d Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 12 Feb 2020 22:53:52 -0800
Subject: [PATCH 206/239] netfilter: xt_hashlimit: unregister proc file before
 releasing mutex

Before releasing the global mutex, we only unlink the hashtable
from the hash list, its proc file is still not unregistered at
this point. So syzbot could trigger a race condition where a
parallel htable_create() could register the same file immediately
after the mutex is released.

Move htable_remove_proc_entry() back to mutex protection to
fix this. And, fold htable_destroy() into htable_put() to make
the code slightly easier to understand.

Reported-and-tested-by: syzbot+d195fd3b9a364ddd6731@syzkaller.appspotmail.com
Fixes: c4a3922d2d20 ("netfilter: xt_hashlimit: reduce hashlimit_mutex scope for htable_put()")
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_hashlimit.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 7a2c4b8408c4..8c835ad63729 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -402,15 +402,6 @@ static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo)
 		remove_proc_entry(hinfo->name, parent);
 }
 
-static void htable_destroy(struct xt_hashlimit_htable *hinfo)
-{
-	cancel_delayed_work_sync(&hinfo->gc_work);
-	htable_remove_proc_entry(hinfo);
-	htable_selective_cleanup(hinfo, true);
-	kfree(hinfo->name);
-	vfree(hinfo);
-}
-
 static struct xt_hashlimit_htable *htable_find_get(struct net *net,
 						   const char *name,
 						   u_int8_t family)
@@ -432,8 +423,13 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 {
 	if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) {
 		hlist_del(&hinfo->node);
+		htable_remove_proc_entry(hinfo);
 		mutex_unlock(&hashlimit_mutex);
-		htable_destroy(hinfo);
+
+		cancel_delayed_work_sync(&hinfo->gc_work);
+		htable_selective_cleanup(hinfo, true);
+		kfree(hinfo->name);
+		vfree(hinfo);
 	}
 }
 

From 9a005c3898aa07cd5cdca77b7096814e6c478c92 Mon Sep 17 00:00:00 2001
From: Jonathan Lemon <jonathan.lemon@gmail.com>
Date: Mon, 24 Feb 2020 15:29:09 -0800
Subject: [PATCH 207/239] bnxt_en: add newline to netdev_*() format strings

Add missing newlines to netdev_* format strings so the lines
aren't buffered by the printk subsystem.

Nitpicked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
Acked-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  4 +-
 .../net/ethernet/broadcom/bnxt/bnxt_devlink.c | 10 ++--
 .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  4 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c  | 48 +++++++++----------
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c | 10 ++--
 5 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index fd6e0e48cd51..f9a8151f092c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -11252,7 +11252,7 @@ static void bnxt_cfg_ntp_filters(struct bnxt *bp)
 		}
 	}
 	if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event))
-		netdev_info(bp->dev, "Receive PF driver unload event!");
+		netdev_info(bp->dev, "Receive PF driver unload event!\n");
 }
 
 #else
@@ -11759,7 +11759,7 @@ static int bnxt_pcie_dsn_get(struct bnxt *bp, u8 dsn[])
 	u32 dw;
 
 	if (!pos) {
-		netdev_info(bp->dev, "Unable do read adapter's DSN");
+		netdev_info(bp->dev, "Unable do read adapter's DSN\n");
 		return -EOPNOTSUPP;
 	}
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index eec0168330b7..d3c93ccee86a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -641,14 +641,14 @@ static int bnxt_dl_params_register(struct bnxt *bp)
 	rc = devlink_params_register(bp->dl, bnxt_dl_params,
 				     ARRAY_SIZE(bnxt_dl_params));
 	if (rc) {
-		netdev_warn(bp->dev, "devlink_params_register failed. rc=%d",
+		netdev_warn(bp->dev, "devlink_params_register failed. rc=%d\n",
 			    rc);
 		return rc;
 	}
 	rc = devlink_port_params_register(&bp->dl_port, bnxt_dl_port_params,
 					  ARRAY_SIZE(bnxt_dl_port_params));
 	if (rc) {
-		netdev_err(bp->dev, "devlink_port_params_register failed");
+		netdev_err(bp->dev, "devlink_port_params_register failed\n");
 		devlink_params_unregister(bp->dl, bnxt_dl_params,
 					  ARRAY_SIZE(bnxt_dl_params));
 		return rc;
@@ -679,7 +679,7 @@ int bnxt_dl_register(struct bnxt *bp)
 	else
 		dl = devlink_alloc(&bnxt_vf_dl_ops, sizeof(struct bnxt_dl));
 	if (!dl) {
-		netdev_warn(bp->dev, "devlink_alloc failed");
+		netdev_warn(bp->dev, "devlink_alloc failed\n");
 		return -ENOMEM;
 	}
 
@@ -692,7 +692,7 @@ int bnxt_dl_register(struct bnxt *bp)
 
 	rc = devlink_register(dl, &bp->pdev->dev);
 	if (rc) {
-		netdev_warn(bp->dev, "devlink_register failed. rc=%d", rc);
+		netdev_warn(bp->dev, "devlink_register failed. rc=%d\n", rc);
 		goto err_dl_free;
 	}
 
@@ -704,7 +704,7 @@ int bnxt_dl_register(struct bnxt *bp)
 			       sizeof(bp->dsn));
 	rc = devlink_port_register(dl, &bp->dl_port, bp->pf.port_id);
 	if (rc) {
-		netdev_err(bp->dev, "devlink_port_register failed");
+		netdev_err(bp->dev, "devlink_port_register failed\n");
 		goto err_dl_unreg;
 	}
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 6171fa8b3677..e8fc1671c581 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -2028,7 +2028,7 @@ int bnxt_flash_package_from_file(struct net_device *dev, const char *filename,
 	}
 
 	if (fw->size > item_len) {
-		netdev_err(dev, "PKG insufficient update area in nvram: %lu",
+		netdev_err(dev, "PKG insufficient update area in nvram: %lu\n",
 			   (unsigned long)fw->size);
 		rc = -EFBIG;
 	} else {
@@ -3338,7 +3338,7 @@ static int bnxt_get_coredump(struct bnxt *bp, void *buf, u32 *dump_len)
 	kfree(coredump.data);
 	*dump_len += sizeof(struct bnxt_coredump_record);
 	if (rc == -ENOBUFS)
-		netdev_err(bp->dev, "Firmware returned large coredump buffer");
+		netdev_err(bp->dev, "Firmware returned large coredump buffer\n");
 	return rc;
 }
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 0cc6ec51f45f..9bec256b0934 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -50,7 +50,7 @@ static u16 bnxt_flow_get_dst_fid(struct bnxt *pf_bp, struct net_device *dev)
 
 	/* check if dev belongs to the same switch */
 	if (!netdev_port_same_parent_id(pf_bp->dev, dev)) {
-		netdev_info(pf_bp->dev, "dev(ifindex=%d) not on same switch",
+		netdev_info(pf_bp->dev, "dev(ifindex=%d) not on same switch\n",
 			    dev->ifindex);
 		return BNXT_FID_INVALID;
 	}
@@ -70,7 +70,7 @@ static int bnxt_tc_parse_redir(struct bnxt *bp,
 	struct net_device *dev = act->dev;
 
 	if (!dev) {
-		netdev_info(bp->dev, "no dev in mirred action");
+		netdev_info(bp->dev, "no dev in mirred action\n");
 		return -EINVAL;
 	}
 
@@ -106,7 +106,7 @@ static int bnxt_tc_parse_tunnel_set(struct bnxt *bp,
 	const struct ip_tunnel_key *tun_key = &tun_info->key;
 
 	if (ip_tunnel_info_af(tun_info) != AF_INET) {
-		netdev_info(bp->dev, "only IPv4 tunnel-encap is supported");
+		netdev_info(bp->dev, "only IPv4 tunnel-encap is supported\n");
 		return -EOPNOTSUPP;
 	}
 
@@ -295,7 +295,7 @@ static int bnxt_tc_parse_actions(struct bnxt *bp,
 	int i, rc;
 
 	if (!flow_action_has_entries(flow_action)) {
-		netdev_info(bp->dev, "no actions");
+		netdev_info(bp->dev, "no actions\n");
 		return -EINVAL;
 	}
 
@@ -370,7 +370,7 @@ static int bnxt_tc_parse_flow(struct bnxt *bp,
 	/* KEY_CONTROL and KEY_BASIC are needed for forming a meaningful key */
 	if ((dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL)) == 0 ||
 	    (dissector->used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC)) == 0) {
-		netdev_info(bp->dev, "cannot form TC key: used_keys = 0x%x",
+		netdev_info(bp->dev, "cannot form TC key: used_keys = 0x%x\n",
 			    dissector->used_keys);
 		return -EOPNOTSUPP;
 	}
@@ -508,7 +508,7 @@ static int bnxt_hwrm_cfa_flow_free(struct bnxt *bp,
 
 	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 	if (rc)
-		netdev_info(bp->dev, "%s: Error rc=%d", __func__, rc);
+		netdev_info(bp->dev, "%s: Error rc=%d\n", __func__, rc);
 
 	return rc;
 }
@@ -841,7 +841,7 @@ static int hwrm_cfa_decap_filter_alloc(struct bnxt *bp,
 		resp = bnxt_get_hwrm_resp_addr(bp, &req);
 		*decap_filter_handle = resp->decap_filter_id;
 	} else {
-		netdev_info(bp->dev, "%s: Error rc=%d", __func__, rc);
+		netdev_info(bp->dev, "%s: Error rc=%d\n", __func__, rc);
 	}
 	mutex_unlock(&bp->hwrm_cmd_lock);
 
@@ -859,7 +859,7 @@ static int hwrm_cfa_decap_filter_free(struct bnxt *bp,
 
 	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 	if (rc)
-		netdev_info(bp->dev, "%s: Error rc=%d", __func__, rc);
+		netdev_info(bp->dev, "%s: Error rc=%d\n", __func__, rc);
 
 	return rc;
 }
@@ -906,7 +906,7 @@ static int hwrm_cfa_encap_record_alloc(struct bnxt *bp,
 		resp = bnxt_get_hwrm_resp_addr(bp, &req);
 		*encap_record_handle = resp->encap_record_id;
 	} else {
-		netdev_info(bp->dev, "%s: Error rc=%d", __func__, rc);
+		netdev_info(bp->dev, "%s: Error rc=%d\n", __func__, rc);
 	}
 	mutex_unlock(&bp->hwrm_cmd_lock);
 
@@ -924,7 +924,7 @@ static int hwrm_cfa_encap_record_free(struct bnxt *bp,
 
 	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 	if (rc)
-		netdev_info(bp->dev, "%s: Error rc=%d", __func__, rc);
+		netdev_info(bp->dev, "%s: Error rc=%d\n", __func__, rc);
 
 	return rc;
 }
@@ -943,7 +943,7 @@ static int bnxt_tc_put_l2_node(struct bnxt *bp,
 					     tc_info->l2_ht_params);
 		if (rc)
 			netdev_err(bp->dev,
-				   "Error: %s: rhashtable_remove_fast: %d",
+				   "Error: %s: rhashtable_remove_fast: %d\n",
 				   __func__, rc);
 		kfree_rcu(l2_node, rcu);
 	}
@@ -972,7 +972,7 @@ bnxt_tc_get_l2_node(struct bnxt *bp, struct rhashtable *l2_table,
 		if (rc) {
 			kfree_rcu(l2_node, rcu);
 			netdev_err(bp->dev,
-				   "Error: %s: rhashtable_insert_fast: %d",
+				   "Error: %s: rhashtable_insert_fast: %d\n",
 				   __func__, rc);
 			return NULL;
 		}
@@ -1031,7 +1031,7 @@ static bool bnxt_tc_can_offload(struct bnxt *bp, struct bnxt_tc_flow *flow)
 	if ((flow->flags & BNXT_TC_FLOW_FLAGS_PORTS) &&
 	    (flow->l4_key.ip_proto != IPPROTO_TCP &&
 	     flow->l4_key.ip_proto != IPPROTO_UDP)) {
-		netdev_info(bp->dev, "Cannot offload non-TCP/UDP (%d) ports",
+		netdev_info(bp->dev, "Cannot offload non-TCP/UDP (%d) ports\n",
 			    flow->l4_key.ip_proto);
 		return false;
 	}
@@ -1088,7 +1088,7 @@ static int bnxt_tc_put_tunnel_node(struct bnxt *bp,
 		rc =  rhashtable_remove_fast(tunnel_table, &tunnel_node->node,
 					     *ht_params);
 		if (rc) {
-			netdev_err(bp->dev, "rhashtable_remove_fast rc=%d", rc);
+			netdev_err(bp->dev, "rhashtable_remove_fast rc=%d\n", rc);
 			rc = -1;
 		}
 		kfree_rcu(tunnel_node, rcu);
@@ -1129,7 +1129,7 @@ bnxt_tc_get_tunnel_node(struct bnxt *bp, struct rhashtable *tunnel_table,
 	tunnel_node->refcount++;
 	return tunnel_node;
 err:
-	netdev_info(bp->dev, "error rc=%d", rc);
+	netdev_info(bp->dev, "error rc=%d\n", rc);
 	return NULL;
 }
 
@@ -1187,7 +1187,7 @@ static void bnxt_tc_put_decap_l2_node(struct bnxt *bp,
 					     &decap_l2_node->node,
 					     tc_info->decap_l2_ht_params);
 		if (rc)
-			netdev_err(bp->dev, "rhashtable_remove_fast rc=%d", rc);
+			netdev_err(bp->dev, "rhashtable_remove_fast rc=%d\n", rc);
 		kfree_rcu(decap_l2_node, rcu);
 	}
 }
@@ -1227,7 +1227,7 @@ static int bnxt_tc_resolve_tunnel_hdrs(struct bnxt *bp,
 
 	rt = ip_route_output_key(dev_net(real_dst_dev), &flow);
 	if (IS_ERR(rt)) {
-		netdev_info(bp->dev, "no route to %pI4b", &flow.daddr);
+		netdev_info(bp->dev, "no route to %pI4b\n", &flow.daddr);
 		return -EOPNOTSUPP;
 	}
 
@@ -1241,7 +1241,7 @@ static int bnxt_tc_resolve_tunnel_hdrs(struct bnxt *bp,
 
 		if (vlan->real_dev != real_dst_dev) {
 			netdev_info(bp->dev,
-				    "dst_dev(%s) doesn't use PF-if(%s)",
+				    "dst_dev(%s) doesn't use PF-if(%s)\n",
 				    netdev_name(dst_dev),
 				    netdev_name(real_dst_dev));
 			rc = -EOPNOTSUPP;
@@ -1253,7 +1253,7 @@ static int bnxt_tc_resolve_tunnel_hdrs(struct bnxt *bp,
 #endif
 	} else if (dst_dev != real_dst_dev) {
 		netdev_info(bp->dev,
-			    "dst_dev(%s) for %pI4b is not PF-if(%s)",
+			    "dst_dev(%s) for %pI4b is not PF-if(%s)\n",
 			    netdev_name(dst_dev), &flow.daddr,
 			    netdev_name(real_dst_dev));
 		rc = -EOPNOTSUPP;
@@ -1262,7 +1262,7 @@ static int bnxt_tc_resolve_tunnel_hdrs(struct bnxt *bp,
 
 	nbr = dst_neigh_lookup(&rt->dst, &flow.daddr);
 	if (!nbr) {
-		netdev_info(bp->dev, "can't lookup neighbor for %pI4b",
+		netdev_info(bp->dev, "can't lookup neighbor for %pI4b\n",
 			    &flow.daddr);
 		rc = -EOPNOTSUPP;
 		goto put_rt;
@@ -1472,7 +1472,7 @@ static int __bnxt_tc_del_flow(struct bnxt *bp,
 	rc = rhashtable_remove_fast(&tc_info->flow_table, &flow_node->node,
 				    tc_info->flow_ht_params);
 	if (rc)
-		netdev_err(bp->dev, "Error: %s: rhashtable_remove_fast rc=%d",
+		netdev_err(bp->dev, "Error: %s: rhashtable_remove_fast rc=%d\n",
 			   __func__, rc);
 
 	kfree_rcu(flow_node, rcu);
@@ -1587,7 +1587,7 @@ static int bnxt_tc_add_flow(struct bnxt *bp, u16 src_fid,
 free_node:
 	kfree_rcu(new_node, rcu);
 done:
-	netdev_err(bp->dev, "Error: %s: cookie=0x%lx error=%d",
+	netdev_err(bp->dev, "Error: %s: cookie=0x%lx error=%d\n",
 		   __func__, tc_flow_cmd->cookie, rc);
 	return rc;
 }
@@ -1700,7 +1700,7 @@ bnxt_hwrm_cfa_flow_stats_get(struct bnxt *bp, int num_flows,
 						le64_to_cpu(resp_bytes[i]);
 		}
 	} else {
-		netdev_info(bp->dev, "error rc=%d", rc);
+		netdev_info(bp->dev, "error rc=%d\n", rc);
 	}
 	mutex_unlock(&bp->hwrm_cmd_lock);
 
@@ -1970,7 +1970,7 @@ static int bnxt_tc_indr_block_event(struct notifier_block *nb,
 						   bp);
 		if (rc)
 			netdev_info(bp->dev,
-				    "Failed to register indirect blk: dev: %s",
+				    "Failed to register indirect blk: dev: %s\n",
 				    netdev->name);
 		break;
 	case NETDEV_UNREGISTER:
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
index b010b34cdaf8..6f2faf81c1ae 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
@@ -43,7 +43,7 @@ static int hwrm_cfa_vfr_alloc(struct bnxt *bp, u16 vf_idx,
 		netdev_dbg(bp->dev, "tx_cfa_action=0x%x, rx_cfa_code=0x%x",
 			   *tx_cfa_action, *rx_cfa_code);
 	} else {
-		netdev_info(bp->dev, "%s error rc=%d", __func__, rc);
+		netdev_info(bp->dev, "%s error rc=%d\n", __func__, rc);
 	}
 
 	mutex_unlock(&bp->hwrm_cmd_lock);
@@ -60,7 +60,7 @@ static int hwrm_cfa_vfr_free(struct bnxt *bp, u16 vf_idx)
 
 	rc = hwrm_send_message(bp, &req, sizeof(req), HWRM_CMD_TIMEOUT);
 	if (rc)
-		netdev_info(bp->dev, "%s error rc=%d", __func__, rc);
+		netdev_info(bp->dev, "%s error rc=%d\n", __func__, rc);
 	return rc;
 }
 
@@ -465,7 +465,7 @@ static int bnxt_vf_reps_create(struct bnxt *bp)
 	return 0;
 
 err:
-	netdev_info(bp->dev, "%s error=%d", __func__, rc);
+	netdev_info(bp->dev, "%s error=%d\n", __func__, rc);
 	kfree(cfa_code_map);
 	__bnxt_vf_reps_destroy(bp);
 	return rc;
@@ -488,7 +488,7 @@ int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode,
 
 	mutex_lock(&bp->sriov_lock);
 	if (bp->eswitch_mode == mode) {
-		netdev_info(bp->dev, "already in %s eswitch mode",
+		netdev_info(bp->dev, "already in %s eswitch mode\n",
 			    mode == DEVLINK_ESWITCH_MODE_LEGACY ?
 			    "legacy" : "switchdev");
 		rc = -EINVAL;
@@ -508,7 +508,7 @@ int bnxt_dl_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		}
 
 		if (pci_num_vf(bp->pdev) == 0) {
-			netdev_info(bp->dev, "Enable VFs before setting switchdev mode");
+			netdev_info(bp->dev, "Enable VFs before setting switchdev mode\n");
 			rc = -EPERM;
 			goto done;
 		}

From 98c5f7d44fef309e692c24c6d71131ee0f0871fb Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 24 Feb 2020 15:56:32 -0800
Subject: [PATCH 208/239] net: dsa: bcm_sf2: Forcibly configure IMP port for
 1Gb/sec

We are still experiencing some packet loss with the existing advanced
congestion buffering (ACB) settings with the IMP port configured for
2Gb/sec, so revert to conservative link speeds that do not produce
packet loss until this is resolved.

Fixes: 8f1880cbe8d0 ("net: dsa: bcm_sf2: Configure IMP port for 2Gb/sec")
Fixes: de34d7084edd ("net: dsa: bcm_sf2: Only 7278 supports 2Gb/sec IMP port")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/bcm_sf2.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index d1955543acd1..b0f5280a83cb 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -69,8 +69,7 @@ static void bcm_sf2_imp_setup(struct dsa_switch *ds, int port)
 		/* Force link status for IMP port */
 		reg = core_readl(priv, offset);
 		reg |= (MII_SW_OR | LINK_STS);
-		if (priv->type == BCM7278_DEVICE_ID)
-			reg |= GMII_SPEED_UP_2G;
+		reg &= ~GMII_SPEED_UP_2G;
 		core_writel(priv, reg, offset);
 
 		/* Enable Broadcast, Multicast, Unicast forwarding to IMP port */

From 2eb51c75dcb354f8aef03d7648318b24630632e1 Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Date: Tue, 25 Feb 2020 17:57:45 +0530
Subject: [PATCH 209/239] net: core: devlink.c: Use built-in RCU list checking

list_for_each_entry_rcu() has built-in RCU and lock checking.

Pass cond argument to list_for_each_entry_rcu() to silence
false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled.

The devlink->lock is held when devlink_dpipe_table_find()
is called in non RCU read side section. Therefore, pass struct devlink
to devlink_dpipe_table_find() for lockdep checking.

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 8d0b558be942..5e220809844c 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2103,11 +2103,11 @@ static int devlink_dpipe_entry_put(struct sk_buff *skb,
 
 static struct devlink_dpipe_table *
 devlink_dpipe_table_find(struct list_head *dpipe_tables,
-			 const char *table_name)
+			 const char *table_name, struct devlink *devlink)
 {
 	struct devlink_dpipe_table *table;
-
-	list_for_each_entry_rcu(table, dpipe_tables, list) {
+	list_for_each_entry_rcu(table, dpipe_tables, list,
+				lockdep_is_held(&devlink->lock)) {
 		if (!strcmp(table->name, table_name))
 			return table;
 	}
@@ -2226,7 +2226,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
 
 	table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
-					 table_name);
+					 table_name, devlink);
 	if (!table)
 		return -EINVAL;
 
@@ -2382,7 +2382,7 @@ static int devlink_dpipe_table_counters_set(struct devlink *devlink,
 	struct devlink_dpipe_table *table;
 
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
-					 table_name);
+					 table_name, devlink);
 	if (!table)
 		return -EINVAL;
 
@@ -6854,7 +6854,7 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
 
 	rcu_read_lock();
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
-					 table_name);
+					 table_name, devlink);
 	enabled = false;
 	if (table)
 		enabled = table->counters_enabled;
@@ -6885,7 +6885,8 @@ int devlink_dpipe_table_register(struct devlink *devlink,
 
 	mutex_lock(&devlink->lock);
 
-	if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) {
+	if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name,
+				     devlink)) {
 		err = -EEXIST;
 		goto unlock;
 	}
@@ -6921,7 +6922,7 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
 
 	mutex_lock(&devlink->lock);
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
-					 table_name);
+					 table_name, devlink);
 	if (!table)
 		goto unlock;
 	list_del_rcu(&table->list);
@@ -7078,7 +7079,7 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink,
 
 	mutex_lock(&devlink->lock);
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
-					 table_name);
+					 table_name, devlink);
 	if (!table) {
 		err = -EINVAL;
 		goto out;

From eabc8bcb292fb9a5757b0c8ab7751f41b0a104f8 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Thu, 27 Feb 2020 02:44:58 +0900
Subject: [PATCH 210/239] kbuild: get rid of trailing slash from subdir-
 example

obj-* needs a trailing slash for a directory, but subdir-* does not.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Documentation/kbuild/makefiles.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst
index 4018ad7c7a11..6bc126a14b3d 100644
--- a/Documentation/kbuild/makefiles.rst
+++ b/Documentation/kbuild/makefiles.rst
@@ -765,7 +765,7 @@ is not sufficient this sometimes needs to be explicit.
 	Example::
 
 		#arch/x86/boot/Makefile
-		subdir- := compressed/
+		subdir- := compressed
 
 The above assignment instructs kbuild to descend down in the
 directory compressed/ when "make clean" is executed.

From 1521a67e6016664941f0917d50cb20053a8826a2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Tue, 25 Feb 2020 13:54:12 +0100
Subject: [PATCH 211/239] sched: act: count in the size of action flags
 bitfield

The put of the flags was added by the commit referenced in fixes tag,
however the size of the message was not extended accordingly.

Fix this by adding size of the flags bitfield to the message size.

Fixes: e38226786022 ("net: sched: update action implementations to support flags")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 90a31b15585f..8c466a712cda 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -186,6 +186,7 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
 		+ nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
 		+ cookie_len /* TCA_ACT_COOKIE */
 		+ nla_total_size(0) /* TCA_ACT_STATS nested */
+		+ nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */
 		/* TCA_STATS_BASIC */
 		+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
 		/* TCA_STATS_PKT64 */

From 402482a6a78e5c61d8a2ec6311fc5b4aca392cd6 Mon Sep 17 00:00:00 2001
From: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Date: Tue, 25 Feb 2020 14:11:59 +0100
Subject: [PATCH 212/239] net: bcmgenet: Clear ID_MODE_DIS in
 EXT_RGMII_OOB_CTRL when not needed

Outdated Raspberry Pi 4 firmware might configure the external PHY as
rgmii although the kernel currently sets it as rgmii-rxid. This makes
connections unreliable as ID_MODE_DIS is left enabled. To avoid this,
explicitly clear that bit whenever we don't need it.

Fixes: da38802211cc ("net: bcmgenet: Add RGMII_RXID support")
Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/genet/bcmmii.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index 6392a2530183..10244941a7a6 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -294,6 +294,7 @@ int bcmgenet_mii_config(struct net_device *dev, bool init)
 	 */
 	if (priv->ext_phy) {
 		reg = bcmgenet_ext_readl(priv, EXT_RGMII_OOB_CTRL);
+		reg &= ~ID_MODE_DIS;
 		reg |= id_mode_dis;
 		if (GENET_IS_V1(priv) || GENET_IS_V2(priv) || GENET_IS_V3(priv))
 			reg |= RGMII_MODE_EN_V123;

From 51e3dfa8906ace90c809235b3d3afebc166b6433 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.ibm.com>
Date: Tue, 25 Feb 2020 16:34:36 +0100
Subject: [PATCH 213/239] net/smc: fix cleanup for linkgroup setup failures

If an SMC connection to a certain peer is setup the first time,
a new linkgroup is created. In case of setup failures, such a
linkgroup is unusable and should disappear. As a first step the
linkgroup is removed from the linkgroup list in smc_lgr_forget().

There are 2 problems:
smc_listen_decline() might be called before linkgroup creation
resulting in a crash due to calling smc_lgr_forget() with
parameter NULL.
If a setup failure occurs after linkgroup creation, the connection
is never unregistered from the linkgroup, preventing linkgroup
freeing.

This patch introduces an enhanced smc_lgr_cleanup_early() function
which
* contains a linkgroup check for early smc_listen_decline()
  invocations
* invokes smc_conn_free() to guarantee unregistering of the
  connection.
* schedules fast linkgroup removal of the unusable linkgroup

And the unused function smcd_conn_free() is removed from smc_core.h.

Fixes: 3b2dec2603d5b ("net/smc: restructure client and server code in af_smc")
Fixes: 2a0674fffb6bc ("net/smc: improve abnormal termination of link groups")
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   | 25 +++++++++++++++----------
 net/smc/smc_core.c | 12 ++++++++++++
 net/smc/smc_core.h |  2 +-
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 90988a511cd5..6fd44bdb0fc3 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -512,15 +512,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
 			     int local_contact)
 {
+	bool is_smcd = smc->conn.lgr->is_smcd;
+
 	if (local_contact == SMC_FIRST_CONTACT)
-		smc_lgr_forget(smc->conn.lgr);
-	if (smc->conn.lgr->is_smcd)
+		smc_lgr_cleanup_early(&smc->conn);
+	else
+		smc_conn_free(&smc->conn);
+	if (is_smcd)
 		/* there is only one lgr role for SMC-D; use server lock */
 		mutex_unlock(&smc_server_lgr_pending);
 	else
 		mutex_unlock(&smc_client_lgr_pending);
 
-	smc_conn_free(&smc->conn);
 	smc->connect_nonblock = 0;
 	return reason_code;
 }
@@ -1091,7 +1094,6 @@ static void smc_listen_out_err(struct smc_sock *new_smc)
 	if (newsmcsk->sk_state == SMC_INIT)
 		sock_put(&new_smc->sk); /* passive closing */
 	newsmcsk->sk_state = SMC_CLOSED;
-	smc_conn_free(&new_smc->conn);
 
 	smc_listen_out(new_smc);
 }
@@ -1102,12 +1104,13 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
 {
 	/* RDMA setup failed, switch back to TCP */
 	if (local_contact == SMC_FIRST_CONTACT)
-		smc_lgr_forget(new_smc->conn.lgr);
+		smc_lgr_cleanup_early(&new_smc->conn);
+	else
+		smc_conn_free(&new_smc->conn);
 	if (reason_code < 0) { /* error, no fallback possible */
 		smc_listen_out_err(new_smc);
 		return;
 	}
-	smc_conn_free(&new_smc->conn);
 	smc_switch_to_fallback(new_smc);
 	new_smc->fallback_rsn = reason_code;
 	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
@@ -1170,16 +1173,18 @@ static int smc_listen_ism_init(struct smc_sock *new_smc,
 			    new_smc->conn.lgr->vlan_id,
 			    new_smc->conn.lgr->smcd)) {
 		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
-			smc_lgr_forget(new_smc->conn.lgr);
-		smc_conn_free(&new_smc->conn);
+			smc_lgr_cleanup_early(&new_smc->conn);
+		else
+			smc_conn_free(&new_smc->conn);
 		return SMC_CLC_DECL_SMCDNOTALK;
 	}
 
 	/* Create send and receive buffers */
 	if (smc_buf_create(new_smc, true)) {
 		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
-			smc_lgr_forget(new_smc->conn.lgr);
-		smc_conn_free(&new_smc->conn);
+			smc_lgr_cleanup_early(&new_smc->conn);
+		else
+			smc_conn_free(&new_smc->conn);
 		return SMC_CLC_DECL_MEM;
 	}
 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2249de5379ee..5b085efa3bce 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -162,6 +162,18 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
 	conn->lgr = NULL;
 }
 
+void smc_lgr_cleanup_early(struct smc_connection *conn)
+{
+	struct smc_link_group *lgr = conn->lgr;
+
+	if (!lgr)
+		return;
+
+	smc_conn_free(conn);
+	smc_lgr_forget(lgr);
+	smc_lgr_schedule_free_work_fast(lgr);
+}
+
 /* Send delete link, either as client to request the initiation
  * of the DELETE LINK sequence from server; or as server to
  * initiate the delete processing. See smc_llc_rx_delete_link().
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index c472e12951d1..234ae25f0025 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -296,6 +296,7 @@ struct smc_clc_msg_accept_confirm;
 struct smc_clc_msg_local;
 
 void smc_lgr_forget(struct smc_link_group *lgr);
+void smc_lgr_cleanup_early(struct smc_connection *conn);
 void smc_lgr_terminate(struct smc_link_group *lgr, bool soft);
 void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport);
 void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
@@ -316,7 +317,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
 
 void smc_conn_free(struct smc_connection *conn);
 int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
-void smcd_conn_free(struct smc_connection *conn);
 void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
 int smc_core_init(void);
 void smc_core_exit(void);

From b6f6118901d1e867ac9177bbff3b00b185bd4fdc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 25 Feb 2020 11:52:29 -0800
Subject: [PATCH 214/239] ipv6: restrict IPV6_ADDRFORM operation

IPV6_ADDRFORM is able to transform IPv6 socket to IPv4 one.
While this operation sounds illogical, we have to support it.

One of the things it does for TCP socket is to switch sk->sk_prot
to tcp_prot.

We now have other layers playing with sk->sk_prot, so we should make
sure to not interfere with them.

This patch makes sure sk_prot is the default pointer for TCP IPv6 socket.

syzbot reported :
BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD a0113067 P4D a0113067 PUD a8771067 PMD 0
Oops: 0010 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 10686 Comm: syz-executor.0 Not tainted 5.6.0-rc2-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:0x0
Code: Bad RIP value.
RSP: 0018:ffffc9000281fce0 EFLAGS: 00010246
RAX: 1ffffffff15f48ac RBX: ffffffff8afa4560 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a69a8f40
RBP: ffffc9000281fd10 R08: ffffffff86ed9b0c R09: ffffed1014d351f5
R10: ffffed1014d351f5 R11: 0000000000000000 R12: ffff8880920d3098
R13: 1ffff1101241a613 R14: ffff8880a69a8f40 R15: 0000000000000000
FS:  00007f2ae75db700(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffffffffd6 CR3: 00000000a3b85000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 inet_release+0x165/0x1c0 net/ipv4/af_inet.c:427
 __sock_release net/socket.c:605 [inline]
 sock_close+0xe1/0x260 net/socket.c:1283
 __fput+0x2e4/0x740 fs/file_table.c:280
 ____fput+0x15/0x20 fs/file_table.c:313
 task_work_run+0x176/0x1b0 kernel/task_work.c:113
 tracehook_notify_resume include/linux/tracehook.h:188 [inline]
 exit_to_usermode_loop arch/x86/entry/common.c:164 [inline]
 prepare_exit_to_usermode+0x480/0x5b0 arch/x86/entry/common.c:195
 syscall_return_slowpath+0x113/0x4a0 arch/x86/entry/common.c:278
 do_syscall_64+0x11f/0x1c0 arch/x86/entry/common.c:304
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x45c429
Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f2ae75dac78 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: 0000000000000000 RBX: 00007f2ae75db6d4 RCX: 000000000045c429
RDX: 0000000000000001 RSI: 000000000000011a RDI: 0000000000000004
RBP: 000000000076bf20 R08: 0000000000000038 R09: 0000000000000000
R10: 0000000020000180 R11: 0000000000000246 R12: 00000000ffffffff
R13: 0000000000000a9d R14: 00000000004ccfb4 R15: 000000000076bf2c
Modules linked in:
CR2: 0000000000000000
---[ end trace 82567b5207e87bae ]---
RIP: 0010:0x0
Code: Bad RIP value.
RSP: 0018:ffffc9000281fce0 EFLAGS: 00010246
RAX: 1ffffffff15f48ac RBX: ffffffff8afa4560 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a69a8f40
RBP: ffffc9000281fd10 R08: ffffffff86ed9b0c R09: ffffed1014d351f5
R10: ffffed1014d351f5 R11: 0000000000000000 R12: ffff8880920d3098
R13: 1ffff1101241a613 R14: ffff8880a69a8f40 R15: 0000000000000000
FS:  00007f2ae75db700(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffffffffd6 CR3: 00000000a3b85000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot+1938db17e275e85dc328@syzkaller.appspotmail.com
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ipv6_sockglue.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 79fc012dd2ca..debdaeba5d8c 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -183,9 +183,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 					retv = -EBUSY;
 					break;
 				}
-			} else if (sk->sk_protocol != IPPROTO_TCP)
+			} else if (sk->sk_protocol == IPPROTO_TCP) {
+				if (sk->sk_prot != &tcpv6_prot) {
+					retv = -EBUSY;
+					break;
+				}
 				break;
-
+			} else {
+				break;
+			}
 			if (sk->sk_state != TCP_ESTABLISHED) {
 				retv = -ENOTCONN;
 				break;

From f596c87005f7b1baeb7d62d9a9e25d68c3dfae10 Mon Sep 17 00:00:00 2001
From: yangerkun <yangerkun@huawei.com>
Date: Wed, 26 Feb 2020 11:54:35 +0800
Subject: [PATCH 215/239] slip: not call free_netdev before rtnl_unlock in
 slip_open

As the description before netdev_run_todo, we cannot call free_netdev
before rtnl_unlock, fix it by reorder the code.

Signed-off-by: yangerkun <yangerkun@huawei.com>
Reviewed-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/slip/slip.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 6f4d7ba8b109..babb01888b78 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -863,7 +863,10 @@ static int slip_open(struct tty_struct *tty)
 	tty->disc_data = NULL;
 	clear_bit(SLF_INUSE, &sl->flags);
 	sl_free_netdev(sl->dev);
+	/* do not call free_netdev before rtnl_unlock */
+	rtnl_unlock();
 	free_netdev(sl->dev);
+	return err;
 
 err_exit:
 	rtnl_unlock();

From 4f31c532ad400a34dbdd836c204ed964d1ec2da5 Mon Sep 17 00:00:00 2001
From: Sudheesh Mavila <sudheesh.mavila@amd.com>
Date: Wed, 26 Feb 2020 12:40:45 +0530
Subject: [PATCH 216/239] net: phy: corrected the return value for
 genphy_check_and_restart_aneg and genphy_c45_check_and_restart_aneg

When auto-negotiation is not required, return value should be zero.

Changes v1->v2:
- improved comments and code as Andrew Lunn and Heiner Kallweit suggestion
- fixed issue in genphy_c45_check_and_restart_aneg as Russell King
  suggestion.

Fixes: 2a10ab043ac5 ("net: phy: add genphy_check_and_restart_aneg()")
Fixes: 1af9f16840e9 ("net: phy: add genphy_c45_check_and_restart_aneg()")
Signed-off-by: Sudheesh Mavila <sudheesh.mavila@amd.com>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy-c45.c    | 6 +++---
 drivers/net/phy/phy_device.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c
index a1caeee12236..dd2e23fb67c0 100644
--- a/drivers/net/phy/phy-c45.c
+++ b/drivers/net/phy/phy-c45.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL_GPL(genphy_c45_restart_aneg);
  */
 int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart)
 {
-	int ret = 0;
+	int ret;
 
 	if (!restart) {
 		/* Configure and restart aneg if it wasn't set before */
@@ -180,9 +180,9 @@ int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart)
 	}
 
 	if (restart)
-		ret = genphy_c45_restart_aneg(phydev);
+		return genphy_c45_restart_aneg(phydev);
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(genphy_c45_check_and_restart_aneg);
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 6131aca79823..c8b0c34030d3 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1793,7 +1793,7 @@ EXPORT_SYMBOL(genphy_restart_aneg);
  */
 int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart)
 {
-	int ret = 0;
+	int ret;
 
 	if (!restart) {
 		/* Advertisement hasn't changed, but maybe aneg was never on to
@@ -1808,9 +1808,9 @@ int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart)
 	}
 
 	if (restart)
-		ret = genphy_restart_aneg(phydev);
+		return genphy_restart_aneg(phydev);
 
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(genphy_check_and_restart_aneg);
 

From dc24f8b4ecd3d6c4153a1ec1bc2006ab32a41b8d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 26 Feb 2020 12:19:03 +0100
Subject: [PATCH 217/239] mptcp: add dummy icsk_sync_mss()

syzbot noted that the master MPTCP socket lacks the icsk_sync_mss
callback, and was able to trigger a null pointer dereference:

BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD 8e171067 P4D 8e171067 PUD 93fa2067 PMD 0
Oops: 0010 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 8984 Comm: syz-executor066 Not tainted 5.6.0-rc2-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:0x0
Code: Bad RIP value.
RSP: 0018:ffffc900020b7b80 EFLAGS: 00010246
RAX: 1ffff110124ba600 RBX: 0000000000000000 RCX: ffff88809fefa600
RDX: ffff8880994cdb18 RSI: 0000000000000000 RDI: ffff8880925d3140
RBP: ffffc900020b7bd8 R08: ffffffff870225be R09: fffffbfff140652a
R10: fffffbfff140652a R11: 0000000000000000 R12: ffff8880925d35d0
R13: ffff8880925d3140 R14: dffffc0000000000 R15: 1ffff110124ba6ba
FS:  0000000001a0b880(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffffffffd6 CR3: 00000000a6d6f000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 cipso_v4_sock_setattr+0x34b/0x470 net/ipv4/cipso_ipv4.c:1888
 netlbl_sock_setattr+0x2a7/0x310 net/netlabel/netlabel_kapi.c:989
 smack_netlabel security/smack/smack_lsm.c:2425 [inline]
 smack_inode_setsecurity+0x3da/0x4a0 security/smack/smack_lsm.c:2716
 security_inode_setsecurity+0xb2/0x140 security/security.c:1364
 __vfs_setxattr_noperm+0x16f/0x3e0 fs/xattr.c:197
 vfs_setxattr fs/xattr.c:224 [inline]
 setxattr+0x335/0x430 fs/xattr.c:451
 __do_sys_fsetxattr fs/xattr.c:506 [inline]
 __se_sys_fsetxattr+0x130/0x1b0 fs/xattr.c:495
 __x64_sys_fsetxattr+0xbf/0xd0 fs/xattr.c:495
 do_syscall_64+0xf7/0x1c0 arch/x86/entry/common.c:294
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x440199
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffcadc19e48 EFLAGS: 00000246 ORIG_RAX: 00000000000000be
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440199
RDX: 0000000020000200 RSI: 00000000200001c0 RDI: 0000000000000003
RBP: 00000000006ca018 R08: 0000000000000003 R09: 00000000004002c8
R10: 0000000000000009 R11: 0000000000000246 R12: 0000000000401a20
R13: 0000000000401ab0 R14: 0000000000000000 R15: 0000000000000000
Modules linked in:
CR2: 0000000000000000

Address the issue adding a dummy icsk_sync_mss callback.
To properly sync the subflows mss and options list we need some
additional infrastructure, which will land to net-next.

Reported-by: syzbot+f4dfece964792d80b139@syzkaller.appspotmail.com
Fixes: 2303f994b3e1 ("mptcp: Associate MPTCP context with TCP socket")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/protocol.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e9aa6807b5be..3c19a8efdcea 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -543,6 +543,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 	}
 }
 
+static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+	return 0;
+}
+
 static int __mptcp_init_sock(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -551,6 +556,7 @@ static int __mptcp_init_sock(struct sock *sk)
 	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
 
 	msk->first = NULL;
+	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
 
 	return 0;
 }

From c87a9d6fc6d555e4981f2ded77d9a8cce950743e Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@bootlin.com>
Date: Wed, 26 Feb 2020 16:26:50 +0100
Subject: [PATCH 218/239] net: phy: mscc: fix firmware paths

The firmware paths for the VSC8584 PHYs not not contain the leading
'microchip/' directory, as used in linux-firmware, resulting in an
error when probing the driver. This patch fixes it.

Fixes: a5afc1678044 ("net: phy: mscc: add support for VSC8584 PHY")
Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mscc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/mscc.c b/drivers/net/phy/mscc.c
index 937ac7da2789..f686f40f6bdc 100644
--- a/drivers/net/phy/mscc.c
+++ b/drivers/net/phy/mscc.c
@@ -345,11 +345,11 @@ enum macsec_bank {
 				BIT(VSC8531_FORCE_LED_OFF) | \
 				BIT(VSC8531_FORCE_LED_ON))
 
-#define MSCC_VSC8584_REVB_INT8051_FW		"mscc_vsc8584_revb_int8051_fb48.bin"
+#define MSCC_VSC8584_REVB_INT8051_FW		"microchip/mscc_vsc8584_revb_int8051_fb48.bin"
 #define MSCC_VSC8584_REVB_INT8051_FW_START_ADDR	0xe800
 #define MSCC_VSC8584_REVB_INT8051_FW_CRC	0xfb48
 
-#define MSCC_VSC8574_REVB_INT8051_FW		"mscc_vsc8574_revb_int8051_29e8.bin"
+#define MSCC_VSC8574_REVB_INT8051_FW		"microchip/mscc_vsc8574_revb_int8051_29e8.bin"
 #define MSCC_VSC8574_REVB_INT8051_FW_START_ADDR	0x4000
 #define MSCC_VSC8574_REVB_INT8051_FW_CRC	0x29e8
 

From 474a31e13a4e9749fb3ee55794d69d0f17ee0998 Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@nokia.com>
Date: Wed, 26 Feb 2020 18:49:01 +0200
Subject: [PATCH 219/239] net: stmmac: fix notifier registration

We cannot register the same netdev notifier multiple times when probing
stmmac devices. Register the notifier only once in module init, and also
make debugfs creation/deletion safe against simultaneous notifier call.

Fixes: 481a7d154cbb ("stmmac: debugfs entry name is not be changed when udev rename device name.")
Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 5836b21edd7e..7da18c9afa01 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4405,6 +4405,8 @@ static void stmmac_init_fs(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
+	rtnl_lock();
+
 	/* Create per netdev entries */
 	priv->dbgfs_dir = debugfs_create_dir(dev->name, stmmac_fs_dir);
 
@@ -4416,14 +4418,13 @@ static void stmmac_init_fs(struct net_device *dev)
 	debugfs_create_file("dma_cap", 0444, priv->dbgfs_dir, dev,
 			    &stmmac_dma_cap_fops);
 
-	register_netdevice_notifier(&stmmac_notifier);
+	rtnl_unlock();
 }
 
 static void stmmac_exit_fs(struct net_device *dev)
 {
 	struct stmmac_priv *priv = netdev_priv(dev);
 
-	unregister_netdevice_notifier(&stmmac_notifier);
 	debugfs_remove_recursive(priv->dbgfs_dir);
 }
 #endif /* CONFIG_DEBUG_FS */
@@ -4940,14 +4941,14 @@ int stmmac_dvr_remove(struct device *dev)
 
 	netdev_info(priv->dev, "%s: removing driver", __func__);
 
-#ifdef CONFIG_DEBUG_FS
-	stmmac_exit_fs(ndev);
-#endif
 	stmmac_stop_all_dma(priv);
 
 	stmmac_mac_set(priv, priv->ioaddr, false);
 	netif_carrier_off(ndev);
 	unregister_netdev(ndev);
+#ifdef CONFIG_DEBUG_FS
+	stmmac_exit_fs(ndev);
+#endif
 	phylink_destroy(priv->phylink);
 	if (priv->plat->stmmac_rst)
 		reset_control_assert(priv->plat->stmmac_rst);
@@ -5166,6 +5167,7 @@ static int __init stmmac_init(void)
 	/* Create debugfs main directory if it doesn't exist yet */
 	if (!stmmac_fs_dir)
 		stmmac_fs_dir = debugfs_create_dir(STMMAC_RESOURCE_NAME, NULL);
+	register_netdevice_notifier(&stmmac_notifier);
 #endif
 
 	return 0;
@@ -5174,6 +5176,7 @@ static int __init stmmac_init(void)
 static void __exit stmmac_exit(void)
 {
 #ifdef CONFIG_DEBUG_FS
+	unregister_netdevice_notifier(&stmmac_notifier);
 	debugfs_remove_recursive(stmmac_fs_dir);
 #endif
 }

From a2f2ef4a54c0d97aa6a8386f4ff23f36ebb488cf Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.ibm.com>
Date: Wed, 26 Feb 2020 17:52:46 +0100
Subject: [PATCH 220/239] net/smc: check for valid ib_client_data

In smc_ib_remove_dev() check if the provided ib device was actually
initialized for SMC before.

Reported-by: syzbot+84484ccebdd4e5451d91@syzkaller.appspotmail.com
Fixes: a4cf0443c414 ("smc: introduce SMC as an IB-client")
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_ib.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 548632621f4b..d6ba186f67e2 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -573,6 +573,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
 	struct smc_ib_device *smcibdev;
 
 	smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+	if (!smcibdev || smcibdev->ibdev != ibdev)
+		return;
 	ib_set_client_data(ibdev, &smc_ib_client, NULL);
 	spin_lock(&smc_ib_devices.lock);
 	list_del_init(&smcibdev->list); /* remove from smc_ib_devices */

From 0bff777bd0cba73ad4cd0145696ad284d7e6a99f Mon Sep 17 00:00:00 2001
From: Luo bin <luobin9@huawei.com>
Date: Thu, 27 Feb 2020 06:34:42 +0000
Subject: [PATCH 221/239] hinic: fix a irq affinity bug

can not use a local variable as an input parameter of
irq_set_affinity_hint

Signed-off-by: Luo bin <luobin9@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h | 1 +
 drivers/net/ethernet/huawei/hinic/hinic_rx.c    | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h
index f4a339b10b10..79091e131418 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h
@@ -94,6 +94,7 @@ struct hinic_rq {
 
 	struct hinic_wq         *wq;
 
+	struct cpumask		affinity_mask;
 	u32                     irq;
 	u16                     msix_entry;
 
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_rx.c b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
index 56ea6d692f1c..2695ad69fca6 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_rx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_rx.c
@@ -475,7 +475,6 @@ static int rx_request_irq(struct hinic_rxq *rxq)
 	struct hinic_hwdev *hwdev = nic_dev->hwdev;
 	struct hinic_rq *rq = rxq->rq;
 	struct hinic_qp *qp;
-	struct cpumask mask;
 	int err;
 
 	rx_add_napi(rxq);
@@ -492,8 +491,8 @@ static int rx_request_irq(struct hinic_rxq *rxq)
 	}
 
 	qp = container_of(rq, struct hinic_qp, rq);
-	cpumask_set_cpu(qp->q_id % num_online_cpus(), &mask);
-	return irq_set_affinity_hint(rq->irq, &mask);
+	cpumask_set_cpu(qp->q_id % num_online_cpus(), &rq->affinity_mask);
+	return irq_set_affinity_hint(rq->irq, &rq->affinity_mask);
 }
 
 static void rx_free_irq(struct hinic_rxq *rxq)

From d2ed69ce9ed3477e2a9527e6b89fe4689d99510e Mon Sep 17 00:00:00 2001
From: Luo bin <luobin9@huawei.com>
Date: Thu, 27 Feb 2020 06:34:43 +0000
Subject: [PATCH 222/239] hinic: fix a bug of setting hw_ioctxt

a reserved field is used to signify prime physical function index
in the latest firmware version, so we must assign a value to it
correctly

Signed-off-by: Luo bin <luobin9@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c | 1 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h | 2 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h  | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
index 6f2cf569a283..79b3d53f2fbf 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
@@ -297,6 +297,7 @@ static int set_hw_ioctxt(struct hinic_hwdev *hwdev, unsigned int rq_depth,
 	}
 
 	hw_ioctxt.func_idx = HINIC_HWIF_FUNC_IDX(hwif);
+	hw_ioctxt.ppf_idx = HINIC_HWIF_PPF_IDX(hwif);
 
 	hw_ioctxt.set_cmdq_depth = HW_IOCTXT_SET_CMDQ_DEPTH_DEFAULT;
 	hw_ioctxt.cmdq_depth = 0;
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
index b069045de416..66fd2340d447 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
@@ -151,8 +151,8 @@ struct hinic_cmd_hw_ioctxt {
 
 	u8      lro_en;
 	u8      rsvd3;
+	u8      ppf_idx;
 	u8      rsvd4;
-	u8      rsvd5;
 
 	u16     rq_depth;
 	u16     rx_buf_sz_idx;
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
index 517794509eb2..c7bb9ceca72c 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
@@ -137,6 +137,7 @@
 #define HINIC_HWIF_FUNC_IDX(hwif)       ((hwif)->attr.func_idx)
 #define HINIC_HWIF_PCI_INTF(hwif)       ((hwif)->attr.pci_intf_idx)
 #define HINIC_HWIF_PF_IDX(hwif)         ((hwif)->attr.pf_idx)
+#define HINIC_HWIF_PPF_IDX(hwif)        ((hwif)->attr.ppf_idx)
 
 #define HINIC_FUNC_TYPE(hwif)           ((hwif)->attr.func_type)
 #define HINIC_IS_PF(hwif)               (HINIC_FUNC_TYPE(hwif) == HINIC_PF)

From 386d4716fd91869e07c731657f2cde5a33086516 Mon Sep 17 00:00:00 2001
From: Luo bin <luobin9@huawei.com>
Date: Thu, 27 Feb 2020 06:34:44 +0000
Subject: [PATCH 223/239] hinic: fix a bug of rss configuration

should use real receive queue number to configure hw rss
indirect table rather than maximal queue number

Signed-off-by: Luo bin <luobin9@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/huawei/hinic/hinic_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c
index 02a14f5e7fe3..13560975c103 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_main.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c
@@ -356,7 +356,8 @@ static void hinic_enable_rss(struct hinic_dev *nic_dev)
 	if (!num_cpus)
 		num_cpus = num_online_cpus();
 
-	nic_dev->num_qps = min_t(u16, nic_dev->max_qps, num_cpus);
+	nic_dev->num_qps = hinic_hwdev_num_qps(hwdev);
+	nic_dev->num_qps = min_t(u16, nic_dev->num_qps, num_cpus);
 
 	nic_dev->rss_limit = nic_dev->num_qps;
 	nic_dev->num_rss = nic_dev->num_qps;

From 3a12500ed5dd21a63da779ac73503f11085bbc1c Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Wed, 26 Feb 2020 18:29:53 +0100
Subject: [PATCH 224/239] unix: define and set show_fdinfo only if procfs is
 enabled

Follow the pattern used with other *_show_fdinfo functions and only
define unix_show_fdinfo and set it in proto_ops if CONFIG_PROCFS
is set.

Fixes: 3c32da19a858 ("unix: Show number of pending scm files of receive queue in fdinfo")
Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 62c12cb5763e..aa6e2530e1ec 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -682,6 +682,7 @@ static int unix_set_peek_off(struct sock *sk, int val)
 	return 0;
 }
 
+#ifdef CONFIG_PROCFS
 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -692,6 +693,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 		seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds));
 	}
 }
+#else
+#define unix_show_fdinfo NULL
+#endif
 
 static const struct proto_ops unix_stream_ops = {
 	.family =	PF_UNIX,

From e387f7d5fccf95299135c88b799184c3bef6a705 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 27 Feb 2020 08:22:10 +0100
Subject: [PATCH 225/239] mlx5: register lag notifier for init network
 namespace only

The current code causes problems when the unregistering netdevice could
be different then the registering one.

Since the check in mlx5_lag_netdev_event() does not allow any other
network namespace anyway, fix this by registerting the lag notifier
per init network namespace only.

Fixes: d48834f9d4b4 ("mlx5: Use dev_net netdevice notifier registrations")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Tested-by: Aya Levin <ayal@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c    |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/lag.c       | 11 +++--------
 drivers/net/ethernet/mellanox/mlx5/core/lag.h       |  1 -
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h |  2 +-
 5 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 966983674663..21de4764d4c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -5147,7 +5147,6 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
 
 static void mlx5e_nic_disable(struct mlx5e_priv *priv)
 {
-	struct net_device *netdev = priv->netdev;
 	struct mlx5_core_dev *mdev = priv->mdev;
 
 #ifdef CONFIG_MLX5_CORE_EN_DCB
@@ -5168,7 +5167,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv)
 		mlx5e_monitor_counter_cleanup(priv);
 
 	mlx5e_disable_async_events(priv);
-	mlx5_lag_remove(mdev, netdev);
+	mlx5_lag_remove(mdev);
 }
 
 int mlx5e_update_nic_rx(struct mlx5e_priv *priv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 7b48ccacebe2..6ed307d7f191 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1861,7 +1861,6 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv)
 
 static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
 {
-	struct net_device *netdev = priv->netdev;
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 
@@ -1870,7 +1869,7 @@ static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv)
 #endif
 	mlx5_notifier_unregister(mdev, &priv->events_nb);
 	cancel_work_sync(&rpriv->uplink_priv.reoffload_flows_work);
-	mlx5_lag_remove(mdev, netdev);
+	mlx5_lag_remove(mdev);
 }
 
 static MLX5E_DEFINE_STATS_GRP(sw_rep, 0);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
index b91eabc09fbc..8e19f6ab8393 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -464,9 +464,6 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
 	struct mlx5_lag *ldev;
 	int changed = 0;
 
-	if (!net_eq(dev_net(ndev), &init_net))
-		return NOTIFY_DONE;
-
 	if ((event != NETDEV_CHANGEUPPER) && (event != NETDEV_CHANGELOWERSTATE))
 		return NOTIFY_DONE;
 
@@ -586,8 +583,7 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
 
 	if (!ldev->nb.notifier_call) {
 		ldev->nb.notifier_call = mlx5_lag_netdev_event;
-		if (register_netdevice_notifier_dev_net(netdev, &ldev->nb,
-							&ldev->nn)) {
+		if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
 			ldev->nb.notifier_call = NULL;
 			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
 		}
@@ -600,7 +596,7 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
 }
 
 /* Must be called with intf_mutex held */
-void mlx5_lag_remove(struct mlx5_core_dev *dev, struct net_device *netdev)
+void mlx5_lag_remove(struct mlx5_core_dev *dev)
 {
 	struct mlx5_lag *ldev;
 	int i;
@@ -620,8 +616,7 @@ void mlx5_lag_remove(struct mlx5_core_dev *dev, struct net_device *netdev)
 
 	if (i == MLX5_MAX_PORTS) {
 		if (ldev->nb.notifier_call)
-			unregister_netdevice_notifier_dev_net(netdev, &ldev->nb,
-							      &ldev->nn);
+			unregister_netdevice_notifier_net(&init_net, &ldev->nb);
 		mlx5_lag_mp_cleanup(ldev);
 		cancel_delayed_work_sync(&ldev->bond_work);
 		mlx5_lag_dev_free(ldev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
index 316ab09e2664..f1068aac6406 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
@@ -44,7 +44,6 @@ struct mlx5_lag {
 	struct workqueue_struct   *wq;
 	struct delayed_work       bond_work;
 	struct notifier_block     nb;
-	struct netdev_net_notifier	nn;
 	struct lag_mp             lag_mp;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index fcce9e0fc82c..da67b28d6e23 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -157,7 +157,7 @@ int mlx5_query_qcam_reg(struct mlx5_core_dev *mdev, u32 *qcam,
 			u8 feature_group, u8 access_reg_group);
 
 void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev);
-void mlx5_lag_remove(struct mlx5_core_dev *dev, struct net_device *netdev);
+void mlx5_lag_remove(struct mlx5_core_dev *dev);
 
 int mlx5_irq_table_init(struct mlx5_core_dev *dev);
 void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev);

From b82cf17ff1957ec35eaee7dc519c365ecd06ba38 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Thu, 27 Feb 2020 09:44:49 +0000
Subject: [PATCH 226/239] net: phy: marvell: don't interpret PHY status unless
 resolved

Don't attempt to interpret the PHY specific status register unless
the PHY is indicating that the resolution is valid.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 28e33ece4ce1..9a8badafea8a 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -1306,6 +1306,9 @@ static int marvell_read_status_page_an(struct phy_device *phydev,
 		}
 	}
 
+	if (!(status & MII_M1011_PHY_STATUS_RESOLVED))
+		return 0;
+
 	if (status & MII_M1011_PHY_STATUS_FULLDUPLEX)
 		phydev->duplex = DUPLEX_FULL;
 	else
@@ -1365,6 +1368,8 @@ static int marvell_read_status_page(struct phy_device *phydev, int page)
 	linkmode_zero(phydev->lp_advertising);
 	phydev->pause = 0;
 	phydev->asym_pause = 0;
+	phydev->speed = SPEED_UNKNOWN;
+	phydev->duplex = DUPLEX_UNKNOWN;
 
 	if (phydev->autoneg == AUTONEG_ENABLE)
 		err = marvell_read_status_page_an(phydev, fiber, status);

From 93b5cbfa9636d385126f211dca9efa7e3f683202 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 27 Feb 2020 12:23:52 +0000
Subject: [PATCH 227/239] net: rmnet: fix NULL pointer dereference in
 rmnet_newlink()

rmnet registers IFLA_LINK interface as a lower interface.
But, IFLA_LINK could be NULL.
In the current code, rmnet doesn't check IFLA_LINK.
So, panic would occur.

Test commands:
    modprobe rmnet
    ip link add rmnet0 type rmnet mux_id 1

Splat looks like:
[   36.826109][ T1115] general protection fault, probably for non-canonical address 0xdffffc0000000000I
[   36.838817][ T1115] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
[   36.839908][ T1115] CPU: 1 PID: 1115 Comm: ip Not tainted 5.6.0-rc1+ #447
[   36.840569][ T1115] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[   36.841408][ T1115] RIP: 0010:rmnet_newlink+0x54/0x510 [rmnet]
[   36.841986][ T1115] Code: 83 ec 18 48 c1 e9 03 80 3c 01 00 0f 85 d4 03 00 00 48 8b 6a 28 48 b8 00 00 00 00 00 c
[   36.843923][ T1115] RSP: 0018:ffff8880b7e0f1c0 EFLAGS: 00010247
[   36.844756][ T1115] RAX: dffffc0000000000 RBX: ffff8880d14cca00 RCX: 1ffff11016fc1e99
[   36.845859][ T1115] RDX: 0000000000000000 RSI: ffff8880c3d04000 RDI: 0000000000000004
[   36.846961][ T1115] RBP: 0000000000000000 R08: ffff8880b7e0f8b0 R09: ffff8880b6ac2d90
[   36.848020][ T1115] R10: ffffffffc0589a40 R11: ffffed1016d585b7 R12: ffffffff88ceaf80
[   36.848788][ T1115] R13: ffff8880c3d04000 R14: ffff8880b7e0f8b0 R15: ffff8880c3d04000
[   36.849546][ T1115] FS:  00007f50ab3360c0(0000) GS:ffff8880da000000(0000) knlGS:0000000000000000
[   36.851784][ T1115] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   36.852422][ T1115] CR2: 000055871afe5ab0 CR3: 00000000ae246001 CR4: 00000000000606e0
[   36.853181][ T1115] Call Trace:
[   36.853514][ T1115]  __rtnl_newlink+0xbdb/0x1270
[   36.853967][ T1115]  ? lock_downgrade+0x6e0/0x6e0
[   36.854420][ T1115]  ? rtnl_link_unregister+0x220/0x220
[   36.854936][ T1115]  ? lock_acquire+0x164/0x3b0
[   36.855376][ T1115]  ? is_bpf_image_address+0xff/0x1d0
[   36.855884][ T1115]  ? rtnl_newlink+0x4c/0x90
[   36.856304][ T1115]  ? kernel_text_address+0x111/0x140
[   36.856857][ T1115]  ? __kernel_text_address+0xe/0x30
[   36.857440][ T1115]  ? unwind_get_return_address+0x5f/0xa0
[   36.858063][ T1115]  ? create_prof_cpu_mask+0x20/0x20
[   36.858644][ T1115]  ? arch_stack_walk+0x83/0xb0
[   36.859171][ T1115]  ? stack_trace_save+0x82/0xb0
[   36.859710][ T1115]  ? stack_trace_consume_entry+0x160/0x160
[   36.860357][ T1115]  ? deactivate_slab.isra.78+0x2c5/0x800
[   36.860928][ T1115]  ? kasan_unpoison_shadow+0x30/0x40
[   36.861520][ T1115]  ? kmem_cache_alloc_trace+0x135/0x350
[   36.862125][ T1115]  ? rtnl_newlink+0x4c/0x90
[   36.864073][ T1115]  rtnl_newlink+0x65/0x90
[ ... ]

Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 06de59521fc4..471e3b2a1403 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -135,6 +135,11 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev,
 	int err = 0;
 	u16 mux_id;
 
+	if (!tb[IFLA_LINK]) {
+		NL_SET_ERR_MSG_MOD(extack, "link not specified");
+		return -EINVAL;
+	}
+
 	real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
 	if (!real_dev || !dev)
 		return -ENODEV;

From 1eb1f43a6e37282348a41e3d68f5e9a6a4359212 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 27 Feb 2020 12:24:26 +0000
Subject: [PATCH 228/239] net: rmnet: fix NULL pointer dereference in
 rmnet_changelink()

In the rmnet_changelink(), it uses IFLA_LINK without checking
NULL pointer.
tb[IFLA_LINK] could be NULL pointer.
So, NULL-ptr-deref could occur.

rmnet already has a lower interface (real_dev).
So, after this patch, rmnet_changelink() does not use IFLA_LINK anymore.

Test commands:
    modprobe rmnet
    ip link add dummy0 type dummy
    ip link add rmnet0 link dummy0 type rmnet mux_id 1
    ip link set rmnet0 type rmnet mux_id 2

Splat looks like:
[   90.578726][ T1131] general protection fault, probably for non-canonical address 0xdffffc0000000000I
[   90.581121][ T1131] KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
[   90.582380][ T1131] CPU: 2 PID: 1131 Comm: ip Not tainted 5.6.0-rc1+ #447
[   90.584285][ T1131] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[   90.587506][ T1131] RIP: 0010:rmnet_changelink+0x5a/0x8a0 [rmnet]
[   90.588546][ T1131] Code: 83 ec 20 48 c1 ea 03 80 3c 02 00 0f 85 6f 07 00 00 48 8b 5e 28 48 b8 00 00 00 00 00 0
[   90.591447][ T1131] RSP: 0018:ffff8880ce78f1b8 EFLAGS: 00010247
[   90.592329][ T1131] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffff8880ce78f8b0
[   90.593253][ T1131] RDX: 0000000000000000 RSI: ffff8880ce78f4a0 RDI: 0000000000000004
[   90.594058][ T1131] RBP: ffff8880cf543e00 R08: 0000000000000002 R09: 0000000000000002
[   90.594859][ T1131] R10: ffffffffc0586a40 R11: 0000000000000000 R12: ffff8880ca47c000
[   90.595690][ T1131] R13: ffff8880ca47c000 R14: ffff8880cf545000 R15: 0000000000000000
[   90.596553][ T1131] FS:  00007f21f6c7e0c0(0000) GS:ffff8880da400000(0000) knlGS:0000000000000000
[   90.597504][ T1131] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   90.599418][ T1131] CR2: 0000556e413db458 CR3: 00000000c917a002 CR4: 00000000000606e0
[   90.600289][ T1131] Call Trace:
[   90.600631][ T1131]  __rtnl_newlink+0x922/0x1270
[   90.601194][ T1131]  ? lock_downgrade+0x6e0/0x6e0
[   90.601724][ T1131]  ? rtnl_link_unregister+0x220/0x220
[   90.602309][ T1131]  ? lock_acquire+0x164/0x3b0
[   90.602784][ T1131]  ? is_bpf_image_address+0xff/0x1d0
[   90.603331][ T1131]  ? rtnl_newlink+0x4c/0x90
[   90.603810][ T1131]  ? kernel_text_address+0x111/0x140
[   90.604419][ T1131]  ? __kernel_text_address+0xe/0x30
[   90.604981][ T1131]  ? unwind_get_return_address+0x5f/0xa0
[   90.605616][ T1131]  ? create_prof_cpu_mask+0x20/0x20
[   90.606304][ T1131]  ? arch_stack_walk+0x83/0xb0
[   90.606985][ T1131]  ? stack_trace_save+0x82/0xb0
[   90.607656][ T1131]  ? stack_trace_consume_entry+0x160/0x160
[   90.608503][ T1131]  ? deactivate_slab.isra.78+0x2c5/0x800
[   90.609336][ T1131]  ? kasan_unpoison_shadow+0x30/0x40
[   90.610096][ T1131]  ? kmem_cache_alloc_trace+0x135/0x350
[   90.610889][ T1131]  ? rtnl_newlink+0x4c/0x90
[   90.611512][ T1131]  rtnl_newlink+0x65/0x90
[ ... ]

Fixes: 23790ef12082 ("net: qualcomm: rmnet: Allow to configure flags for existing devices")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 471e3b2a1403..ac58f584190b 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -300,10 +300,8 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (!dev)
 		return -ENODEV;
 
-	real_dev = __dev_get_by_index(dev_net(dev),
-				      nla_get_u32(tb[IFLA_LINK]));
-
-	if (!real_dev || !rmnet_is_real_dev_registered(real_dev))
+	real_dev = priv->real_dev;
+	if (!rmnet_is_real_dev_registered(real_dev))
 		return -ENODEV;
 
 	port = rmnet_get_port_rtnl(real_dev);

From 102210f7664442d8c0ce332c006ea90626df745b Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 27 Feb 2020 12:24:45 +0000
Subject: [PATCH 229/239] net: rmnet: fix suspicious RCU usage

rmnet_get_port() internally calls rcu_dereference_rtnl(),
which checks RTNL.
But rmnet_get_port() could be called by packet path.
The packet path is not protected by RTNL.
So, the suspicious RCU usage problem occurs.

Test commands:
    modprobe rmnet
    ip netns add nst
    ip link add veth0 type veth peer name veth1
    ip link set veth1 netns nst
    ip link add rmnet0 link veth0 type rmnet mux_id 1
    ip netns exec nst ip link add rmnet1 link veth1 type rmnet mux_id 1
    ip netns exec nst ip link set veth1 up
    ip netns exec nst ip link set rmnet1 up
    ip netns exec nst ip a a 192.168.100.2/24 dev rmnet1
    ip link set veth0 up
    ip link set rmnet0 up
    ip a a 192.168.100.1/24 dev rmnet0
    ping 192.168.100.2

Splat looks like:
[  146.630958][ T1174] WARNING: suspicious RCU usage
[  146.631735][ T1174] 5.6.0-rc1+ #447 Not tainted
[  146.632387][ T1174] -----------------------------
[  146.633151][ T1174] drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c:386 suspicious rcu_dereference_check() !
[  146.634742][ T1174]
[  146.634742][ T1174] other info that might help us debug this:
[  146.634742][ T1174]
[  146.645992][ T1174]
[  146.645992][ T1174] rcu_scheduler_active = 2, debug_locks = 1
[  146.646937][ T1174] 5 locks held by ping/1174:
[  146.647609][ T1174]  #0: ffff8880c31dea70 (sk_lock-AF_INET){+.+.}, at: raw_sendmsg+0xab8/0x2980
[  146.662463][ T1174]  #1: ffffffff93925660 (rcu_read_lock_bh){....}, at: ip_finish_output2+0x243/0x2150
[  146.671696][ T1174]  #2: ffffffff93925660 (rcu_read_lock_bh){....}, at: __dev_queue_xmit+0x213/0x2940
[  146.673064][ T1174]  #3: ffff8880c19ecd58 (&dev->qdisc_running_key#7){+...}, at: ip_finish_output2+0x714/0x2150
[  146.690358][ T1174]  #4: ffff8880c5796898 (&dev->qdisc_xmit_lock_key#3){+.-.}, at: sch_direct_xmit+0x1e2/0x1020
[  146.699875][ T1174]
[  146.699875][ T1174] stack backtrace:
[  146.701091][ T1174] CPU: 0 PID: 1174 Comm: ping Not tainted 5.6.0-rc1+ #447
[  146.705215][ T1174] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[  146.706565][ T1174] Call Trace:
[  146.707102][ T1174]  dump_stack+0x96/0xdb
[  146.708007][ T1174]  rmnet_get_port.part.9+0x76/0x80 [rmnet]
[  146.709233][ T1174]  rmnet_egress_handler+0x107/0x420 [rmnet]
[  146.710492][ T1174]  ? sch_direct_xmit+0x1e2/0x1020
[  146.716193][ T1174]  rmnet_vnd_start_xmit+0x3d/0xa0 [rmnet]
[  146.717012][ T1174]  dev_hard_start_xmit+0x160/0x740
[  146.717854][ T1174]  sch_direct_xmit+0x265/0x1020
[  146.718577][ T1174]  ? register_lock_class+0x14d0/0x14d0
[  146.719429][ T1174]  ? dev_watchdog+0xac0/0xac0
[  146.723738][ T1174]  ? __dev_queue_xmit+0x15fd/0x2940
[  146.724469][ T1174]  ? lock_acquire+0x164/0x3b0
[  146.725172][ T1174]  __dev_queue_xmit+0x20c7/0x2940
[ ... ]

Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c  | 13 ++++++-------
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h  |  2 +-
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c    |  4 ++--
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index ac58f584190b..fc68ecdd804b 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -382,11 +382,10 @@ struct rtnl_link_ops rmnet_link_ops __read_mostly = {
 	.fill_info	= rmnet_fill_info,
 };
 
-/* Needs either rcu_read_lock() or rtnl lock */
-struct rmnet_port *rmnet_get_port(struct net_device *real_dev)
+struct rmnet_port *rmnet_get_port_rcu(struct net_device *real_dev)
 {
 	if (rmnet_is_real_dev_registered(real_dev))
-		return rcu_dereference_rtnl(real_dev->rx_handler_data);
+		return rcu_dereference_bh(real_dev->rx_handler_data);
 	else
 		return NULL;
 }
@@ -412,7 +411,7 @@ int rmnet_add_bridge(struct net_device *rmnet_dev,
 	struct rmnet_port *port, *slave_port;
 	int err;
 
-	port = rmnet_get_port(real_dev);
+	port = rmnet_get_port_rtnl(real_dev);
 
 	/* If there is more than one rmnet dev attached, its probably being
 	 * used for muxing. Skip the briding in that case
@@ -427,7 +426,7 @@ int rmnet_add_bridge(struct net_device *rmnet_dev,
 	if (err)
 		return -EBUSY;
 
-	slave_port = rmnet_get_port(slave_dev);
+	slave_port = rmnet_get_port_rtnl(slave_dev);
 	slave_port->rmnet_mode = RMNET_EPMODE_BRIDGE;
 	slave_port->bridge_ep = real_dev;
 
@@ -445,11 +444,11 @@ int rmnet_del_bridge(struct net_device *rmnet_dev,
 	struct net_device *real_dev = priv->real_dev;
 	struct rmnet_port *port, *slave_port;
 
-	port = rmnet_get_port(real_dev);
+	port = rmnet_get_port_rtnl(real_dev);
 	port->rmnet_mode = RMNET_EPMODE_VND;
 	port->bridge_ep = NULL;
 
-	slave_port = rmnet_get_port(slave_dev);
+	slave_port = rmnet_get_port_rtnl(slave_dev);
 	rmnet_unregister_real_device(slave_dev, slave_port);
 
 	netdev_dbg(slave_dev, "removed from rmnet as slave\n");
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index cd0a6bcbe74a..0d568dcfd65a 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -65,7 +65,7 @@ struct rmnet_priv {
 	struct rmnet_priv_stats stats;
 };
 
-struct rmnet_port *rmnet_get_port(struct net_device *real_dev);
+struct rmnet_port *rmnet_get_port_rcu(struct net_device *real_dev);
 struct rmnet_endpoint *rmnet_get_endpoint(struct rmnet_port *port, u8 mux_id);
 int rmnet_add_bridge(struct net_device *rmnet_dev,
 		     struct net_device *slave_dev,
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index 1b74bc160402..074a8b326c30 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -184,7 +184,7 @@ rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb)
 		return RX_HANDLER_PASS;
 
 	dev = skb->dev;
-	port = rmnet_get_port(dev);
+	port = rmnet_get_port_rcu(dev);
 
 	switch (port->rmnet_mode) {
 	case RMNET_EPMODE_VND:
@@ -217,7 +217,7 @@ void rmnet_egress_handler(struct sk_buff *skb)
 	skb->dev = priv->real_dev;
 	mux_id = priv->mux_id;
 
-	port = rmnet_get_port(skb->dev);
+	port = rmnet_get_port_rcu(skb->dev);
 	if (!port)
 		goto drop;
 

From c026d970102e9af9958edefb4a015702c6aab636 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 27 Feb 2020 12:25:05 +0000
Subject: [PATCH 230/239] net: rmnet: remove rcu_read_lock in
 rmnet_force_unassociate_device()

The notifier_call() of the slave interface removes rmnet interface with
unregister_netdevice_queue().
But, before calling unregister_netdevice_queue(), it acquires
rcu readlock.
In the RCU critical section, sleeping isn't be allowed.
But, unregister_netdevice_queue() internally calls synchronize_net(),
which would sleep.
So, suspicious RCU usage warning occurs.

Test commands:
    modprobe rmnet
    ip link add dummy0 type dummy
    ip link add dummy1 type dummy
    ip link add rmnet0 link dummy0 type rmnet mux_id 1
    ip link set dummy1 master rmnet0
    ip link del dummy0

Splat looks like:
[   79.639245][ T1195] =============================
[   79.640134][ T1195] WARNING: suspicious RCU usage
[   79.640852][ T1195] 5.6.0-rc1+ #447 Not tainted
[   79.641657][ T1195] -----------------------------
[   79.642472][ T1195] ./include/linux/rcupdate.h:273 Illegal context switch in RCU read-side critical section!
[   79.644043][ T1195]
[   79.644043][ T1195] other info that might help us debug this:
[   79.644043][ T1195]
[   79.645682][ T1195]
[   79.645682][ T1195] rcu_scheduler_active = 2, debug_locks = 1
[   79.646980][ T1195] 2 locks held by ip/1195:
[   79.647629][ T1195]  #0: ffffffffa3cf64f0 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x457/0x890
[   79.649312][ T1195]  #1: ffffffffa39256c0 (rcu_read_lock){....}, at: rmnet_config_notify_cb+0xf0/0x590 [rmnet]
[   79.651717][ T1195]
[   79.651717][ T1195] stack backtrace:
[   79.652650][ T1195] CPU: 3 PID: 1195 Comm: ip Not tainted 5.6.0-rc1+ #447
[   79.653702][ T1195] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[   79.655037][ T1195] Call Trace:
[   79.655560][ T1195]  dump_stack+0x96/0xdb
[   79.656252][ T1195]  ___might_sleep+0x345/0x440
[   79.656994][ T1195]  synchronize_net+0x18/0x30
[   79.661132][ T1195]  netdev_rx_handler_unregister+0x40/0xb0
[   79.666266][ T1195]  rmnet_unregister_real_device+0x42/0xb0 [rmnet]
[   79.667211][ T1195]  rmnet_config_notify_cb+0x1f7/0x590 [rmnet]
[   79.668121][ T1195]  ? rmnet_unregister_bridge.isra.6+0xf0/0xf0 [rmnet]
[   79.669166][ T1195]  ? rmnet_unregister_bridge.isra.6+0xf0/0xf0 [rmnet]
[   79.670286][ T1195]  ? __module_text_address+0x13/0x140
[   79.671139][ T1195]  notifier_call_chain+0x90/0x160
[   79.671973][ T1195]  rollback_registered_many+0x660/0xcf0
[   79.672893][ T1195]  ? netif_set_real_num_tx_queues+0x780/0x780
[   79.675091][ T1195]  ? __lock_acquire+0xdfe/0x3de0
[   79.675825][ T1195]  ? memset+0x1f/0x40
[   79.676367][ T1195]  ? __nla_validate_parse+0x98/0x1ab0
[   79.677290][ T1195]  unregister_netdevice_many.part.133+0x13/0x1b0
[   79.678163][ T1195]  rtnl_delete_link+0xbc/0x100
[ ... ]

Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index fc68ecdd804b..0ad64aa66592 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -230,7 +230,6 @@ static void rmnet_force_unassociate_device(struct net_device *dev)
 
 	port = rmnet_get_port_rtnl(dev);
 
-	rcu_read_lock();
 	rmnet_unregister_bridge(dev, port);
 
 	hash_for_each_safe(port->muxed_ep, bkt_ep, tmp_ep, ep, hlnode) {
@@ -241,7 +240,6 @@ static void rmnet_force_unassociate_device(struct net_device *dev)
 		kfree(ep);
 	}
 
-	rcu_read_unlock();
 	unregister_netdevice_many(&list);
 
 	rmnet_unregister_real_device(real_dev, port);

From 1dc49e9d164cd7e11c81279c83db84a147e14740 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 27 Feb 2020 12:25:19 +0000
Subject: [PATCH 231/239] net: rmnet: do not allow to change mux id if mux id
 is duplicated

Basically, duplicate mux id isn't be allowed.
So, the creation of rmnet will be failed if there is duplicate mux id
is existing.
But, changelink routine doesn't check duplicate mux id.

Test commands:
    modprobe rmnet
    ip link add dummy0 type dummy
    ip link add rmnet0 link dummy0 type rmnet mux_id 1
    ip link add rmnet1 link dummy0 type rmnet mux_id 2
    ip link set rmnet1 type rmnet mux_id 1

Fixes: 23790ef12082 ("net: qualcomm: rmnet: Allow to configure flags for existing devices")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 0ad64aa66592..3c0e6d24d083 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -306,6 +306,10 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[],
 
 	if (data[IFLA_RMNET_MUX_ID]) {
 		mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]);
+		if (rmnet_get_endpoint(port, mux_id)) {
+			NL_SET_ERR_MSG_MOD(extack, "MUX ID already exists");
+			return -EINVAL;
+		}
 		ep = rmnet_get_endpoint(port, priv->mux_id);
 		if (!ep)
 			return -ENODEV;

From 037f9cdf72fb8a7ff9ec2b5dd05336ec1492bdf1 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 27 Feb 2020 12:25:43 +0000
Subject: [PATCH 232/239] net: rmnet: use upper/lower device infrastructure

netdev_upper_dev_link() is useful to manage lower/upper interfaces.
And this function internally validates looping, maximum depth.
All or most virtual interfaces that could have a real interface
(e.g. macsec, macvlan, ipvlan etc.) use lower/upper infrastructure.

Test commands:
    modprobe rmnet
    ip link add dummy0 type dummy
    ip link add rmnet1 link dummy0 type rmnet mux_id 1
    for i in {2..100}
    do
        let A=$i-1
        ip link add rmnet$i link rmnet$A type rmnet mux_id $i
    done
    ip link del dummy0

The purpose of the test commands is to make stack overflow.

Splat looks like:
[   52.411438][ T1395] BUG: KASAN: slab-out-of-bounds in find_busiest_group+0x27e/0x2c00
[   52.413218][ T1395] Write of size 64 at addr ffff8880c774bde0 by task ip/1395
[   52.414841][ T1395]
[   52.430720][ T1395] CPU: 1 PID: 1395 Comm: ip Not tainted 5.6.0-rc1+ #447
[   52.496511][ T1395] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[   52.513597][ T1395] Call Trace:
[   52.546516][ T1395]
[   52.558773][ T1395] Allocated by task 3171537984:
[   52.588290][ T1395] BUG: unable to handle page fault for address: ffffffffb999e260
[   52.589311][ T1395] #PF: supervisor read access in kernel mode
[   52.590529][ T1395] #PF: error_code(0x0000) - not-present page
[   52.591374][ T1395] PGD d6818067 P4D d6818067 PUD d6819063 PMD 0
[   52.592288][ T1395] Thread overran stack, or stack corrupted
[   52.604980][ T1395] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI
[   52.605856][ T1395] CPU: 1 PID: 1395 Comm: ip Not tainted 5.6.0-rc1+ #447
[   52.611764][ T1395] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[   52.621520][ T1395] RIP: 0010:stack_depot_fetch+0x10/0x30
[   52.622296][ T1395] Code: ff e9 f9 fe ff ff 48 89 df e8 9c 1d 91 ff e9 ca fe ff ff cc cc cc cc cc cc cc 89 f8 0
[   52.627887][ T1395] RSP: 0018:ffff8880c774bb60 EFLAGS: 00010006
[   52.628735][ T1395] RAX: 00000000001f8880 RBX: ffff8880c774d140 RCX: 0000000000000000
[   52.631773][ T1395] RDX: 000000000000001d RSI: ffff8880c774bb68 RDI: 0000000000003ff0
[   52.649584][ T1395] RBP: ffffea00031dd200 R08: ffffed101b43e403 R09: ffffed101b43e403
[   52.674857][ T1395] R10: 0000000000000001 R11: ffffed101b43e402 R12: ffff8880d900e5c0
[   52.678257][ T1395] R13: ffff8880c774c000 R14: 0000000000000000 R15: dffffc0000000000
[   52.694541][ T1395] FS:  00007fe867f6e0c0(0000) GS:ffff8880da000000(0000) knlGS:0000000000000000
[   52.764039][ T1395] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   52.815008][ T1395] CR2: ffffffffb999e260 CR3: 00000000c26aa005 CR4: 00000000000606e0
[   52.862312][ T1395] Call Trace:
[   52.887133][ T1395] Modules linked in: dummy rmnet veth openvswitch nsh nf_conncount nf_nat nf_conntrack nf_dex
[   52.936749][ T1395] CR2: ffffffffb999e260
[   52.965695][ T1395] ---[ end trace 7e32ca99482dbb31 ]---
[   52.966556][ T1395] RIP: 0010:stack_depot_fetch+0x10/0x30
[   52.971083][ T1395] Code: ff e9 f9 fe ff ff 48 89 df e8 9c 1d 91 ff e9 ca fe ff ff cc cc cc cc cc cc cc 89 f8 0
[   53.003650][ T1395] RSP: 0018:ffff8880c774bb60 EFLAGS: 00010006
[   53.043183][ T1395] RAX: 00000000001f8880 RBX: ffff8880c774d140 RCX: 0000000000000000
[   53.076480][ T1395] RDX: 000000000000001d RSI: ffff8880c774bb68 RDI: 0000000000003ff0
[   53.093858][ T1395] RBP: ffffea00031dd200 R08: ffffed101b43e403 R09: ffffed101b43e403
[   53.112795][ T1395] R10: 0000000000000001 R11: ffffed101b43e402 R12: ffff8880d900e5c0
[   53.139837][ T1395] R13: ffff8880c774c000 R14: 0000000000000000 R15: dffffc0000000000
[   53.141500][ T1395] FS:  00007fe867f6e0c0(0000) GS:ffff8880da000000(0000) knlGS:0000000000000000
[   53.143343][ T1395] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   53.152007][ T1395] CR2: ffffffffb999e260 CR3: 00000000c26aa005 CR4: 00000000000606e0
[   53.156459][ T1395] Kernel panic - not syncing: Fatal exception
[   54.213570][ T1395] Shutting down cpus with NMI
[   54.354112][ T1395] Kernel Offset: 0x33000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0x)
[   54.355687][ T1395] Rebooting in 5 seconds..

Fixes: b37f78f234bf ("net: qualcomm: rmnet: Fix crash on real dev unregistration")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/qualcomm/rmnet/rmnet_config.c    | 35 +++++++++----------
 1 file changed, 16 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 3c0e6d24d083..e3fbf2331b96 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -61,9 +61,6 @@ static int rmnet_unregister_real_device(struct net_device *real_dev,
 
 	kfree(port);
 
-	/* release reference on real_dev */
-	dev_put(real_dev);
-
 	netdev_dbg(real_dev, "Removed from rmnet\n");
 	return 0;
 }
@@ -89,9 +86,6 @@ static int rmnet_register_real_device(struct net_device *real_dev)
 		return -EBUSY;
 	}
 
-	/* hold on to real dev for MAP data */
-	dev_hold(real_dev);
-
 	for (entry = 0; entry < RMNET_MAX_LOGICAL_EP; entry++)
 		INIT_HLIST_HEAD(&port->muxed_ep[entry]);
 
@@ -162,6 +156,10 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev,
 	if (err)
 		goto err1;
 
+	err = netdev_upper_dev_link(real_dev, dev, extack);
+	if (err < 0)
+		goto err2;
+
 	port->rmnet_mode = mode;
 
 	hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]);
@@ -178,6 +176,8 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev,
 
 	return 0;
 
+err2:
+	unregister_netdevice(dev);
 err1:
 	rmnet_unregister_real_device(real_dev, port);
 err0:
@@ -209,33 +209,30 @@ static void rmnet_dellink(struct net_device *dev, struct list_head *head)
 		rmnet_vnd_dellink(mux_id, port, ep);
 		kfree(ep);
 	}
+	netdev_upper_dev_unlink(real_dev, dev);
 	rmnet_unregister_real_device(real_dev, port);
 
 	unregister_netdevice_queue(dev, head);
 }
 
-static void rmnet_force_unassociate_device(struct net_device *dev)
+static void rmnet_force_unassociate_device(struct net_device *real_dev)
 {
-	struct net_device *real_dev = dev;
 	struct hlist_node *tmp_ep;
 	struct rmnet_endpoint *ep;
 	struct rmnet_port *port;
 	unsigned long bkt_ep;
 	LIST_HEAD(list);
 
-	if (!rmnet_is_real_dev_registered(real_dev))
-		return;
-
 	ASSERT_RTNL();
 
-	port = rmnet_get_port_rtnl(dev);
+	port = rmnet_get_port_rtnl(real_dev);
 
-	rmnet_unregister_bridge(dev, port);
+	rmnet_unregister_bridge(real_dev, port);
 
 	hash_for_each_safe(port->muxed_ep, bkt_ep, tmp_ep, ep, hlnode) {
+		netdev_upper_dev_unlink(real_dev, ep->egress_dev);
 		unregister_netdevice_queue(ep->egress_dev, &list);
 		rmnet_vnd_dellink(ep->mux_id, port, ep);
-
 		hlist_del_init_rcu(&ep->hlnode);
 		kfree(ep);
 	}
@@ -248,15 +245,15 @@ static void rmnet_force_unassociate_device(struct net_device *dev)
 static int rmnet_config_notify_cb(struct notifier_block *nb,
 				  unsigned long event, void *data)
 {
-	struct net_device *dev = netdev_notifier_info_to_dev(data);
+	struct net_device *real_dev = netdev_notifier_info_to_dev(data);
 
-	if (!dev)
+	if (!rmnet_is_real_dev_registered(real_dev))
 		return NOTIFY_DONE;
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
-		netdev_dbg(dev, "Kernel unregister\n");
-		rmnet_force_unassociate_device(dev);
+		netdev_dbg(real_dev, "Kernel unregister\n");
+		rmnet_force_unassociate_device(real_dev);
 		break;
 
 	default:
@@ -477,8 +474,8 @@ static int __init rmnet_init(void)
 
 static void __exit rmnet_exit(void)
 {
-	unregister_netdevice_notifier(&rmnet_dev_notifier);
 	rtnl_link_unregister(&rmnet_link_ops);
+	unregister_netdevice_notifier(&rmnet_dev_notifier);
 }
 
 module_init(rmnet_init)

From d939b6d30bea1a2322bc536b12be0a7c4c2bccd7 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 27 Feb 2020 12:26:02 +0000
Subject: [PATCH 233/239] net: rmnet: fix bridge mode bugs

In order to attach a bridge interface to the rmnet interface,
"master" operation is used.
(e.g. ip link set dummy1 master rmnet0)
But, in the rmnet_add_bridge(), which is a callback of ->ndo_add_slave()
doesn't register lower interface.
So, ->ndo_del_slave() doesn't work.
There are other problems too.
1. It couldn't detect circular upper/lower interface relationship.
2. It couldn't prevent stack overflow because of too deep depth
of upper/lower interface
3. It doesn't check the number of lower interfaces.
4. Panics because of several reasons.

The root problem of these issues is actually the same.
So, in this patch, these all problems will be fixed.

Test commands:
    modprobe rmnet
    ip link add dummy0 type dummy
    ip link add rmnet0 link dummy0 type rmnet mux_id 1
    ip link add dummy1 master rmnet0 type dummy
    ip link add dummy2 master rmnet0 type dummy
    ip link del rmnet0
    ip link del dummy2
    ip link del dummy1

Splat looks like:
[   41.867595][ T1164] general protection fault, probably for non-canonical address 0xdffffc0000000101I
[   41.869993][ T1164] KASAN: null-ptr-deref in range [0x0000000000000808-0x000000000000080f]
[   41.872950][ T1164] CPU: 0 PID: 1164 Comm: ip Not tainted 5.6.0-rc1+ #447
[   41.873915][ T1164] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[   41.875161][ T1164] RIP: 0010:rmnet_unregister_bridge.isra.6+0x71/0xf0 [rmnet]
[   41.876178][ T1164] Code: 48 89 ef 48 89 c6 5b 5d e9 fc fe ff ff e8 f7 f3 ff ff 48 8d b8 08 08 00 00 48 ba 00 7
[   41.878925][ T1164] RSP: 0018:ffff8880c4d0f188 EFLAGS: 00010202
[   41.879774][ T1164] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000101
[   41.887689][ T1164] RDX: dffffc0000000000 RSI: ffffffffb8cf64f0 RDI: 0000000000000808
[   41.888727][ T1164] RBP: ffff8880c40e4000 R08: ffffed101b3c0e3c R09: 0000000000000001
[   41.889749][ T1164] R10: 0000000000000001 R11: ffffed101b3c0e3b R12: 1ffff110189a1e3c
[   41.890783][ T1164] R13: ffff8880c4d0f200 R14: ffffffffb8d56160 R15: ffff8880ccc2c000
[   41.891794][ T1164] FS:  00007f4300edc0c0(0000) GS:ffff8880d9c00000(0000) knlGS:0000000000000000
[   41.892953][ T1164] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   41.893800][ T1164] CR2: 00007f43003bc8c0 CR3: 00000000ca53e001 CR4: 00000000000606f0
[   41.894824][ T1164] Call Trace:
[   41.895274][ T1164]  ? rcu_is_watching+0x2c/0x80
[   41.895895][ T1164]  rmnet_config_notify_cb+0x1f7/0x590 [rmnet]
[   41.896687][ T1164]  ? rmnet_unregister_bridge.isra.6+0xf0/0xf0 [rmnet]
[   41.897611][ T1164]  ? rmnet_unregister_bridge.isra.6+0xf0/0xf0 [rmnet]
[   41.898508][ T1164]  ? __module_text_address+0x13/0x140
[   41.899162][ T1164]  notifier_call_chain+0x90/0x160
[   41.899814][ T1164]  rollback_registered_many+0x660/0xcf0
[   41.900544][ T1164]  ? netif_set_real_num_tx_queues+0x780/0x780
[   41.901316][ T1164]  ? __lock_acquire+0xdfe/0x3de0
[   41.901958][ T1164]  ? memset+0x1f/0x40
[   41.902468][ T1164]  ? __nla_validate_parse+0x98/0x1ab0
[   41.903166][ T1164]  unregister_netdevice_many.part.133+0x13/0x1b0
[   41.903988][ T1164]  rtnl_delete_link+0xbc/0x100
[ ... ]

Fixes: 60d58f971c10 ("net: qualcomm: rmnet: Implement bridge mode")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../ethernet/qualcomm/rmnet/rmnet_config.c    | 131 +++++++++---------
 .../ethernet/qualcomm/rmnet/rmnet_config.h    |   1 +
 .../net/ethernet/qualcomm/rmnet/rmnet_vnd.c   |   8 --
 .../net/ethernet/qualcomm/rmnet/rmnet_vnd.h   |   1 -
 4 files changed, 64 insertions(+), 77 deletions(-)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index e3fbf2331b96..fbf4cbcf1a65 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -13,25 +13,6 @@
 #include "rmnet_vnd.h"
 #include "rmnet_private.h"
 
-/* Locking scheme -
- * The shared resource which needs to be protected is realdev->rx_handler_data.
- * For the writer path, this is using rtnl_lock(). The writer paths are
- * rmnet_newlink(), rmnet_dellink() and rmnet_force_unassociate_device(). These
- * paths are already called with rtnl_lock() acquired in. There is also an
- * ASSERT_RTNL() to ensure that we are calling with rtnl acquired. For
- * dereference here, we will need to use rtnl_dereference(). Dev list writing
- * needs to happen with rtnl_lock() acquired for netdev_master_upper_dev_link().
- * For the reader path, the real_dev->rx_handler_data is called in the TX / RX
- * path. We only need rcu_read_lock() for these scenarios. In these cases,
- * the rcu_read_lock() is held in __dev_queue_xmit() and
- * netif_receive_skb_internal(), so readers need to use rcu_dereference_rtnl()
- * to get the relevant information. For dev list reading, we again acquire
- * rcu_read_lock() in rmnet_dellink() for netdev_master_upper_dev_get_rcu().
- * We also use unregister_netdevice_many() to free all rmnet devices in
- * rmnet_force_unassociate_device() so we dont lose the rtnl_lock() and free in
- * same context.
- */
-
 /* Local Definitions and Declarations */
 
 static const struct nla_policy rmnet_policy[IFLA_RMNET_MAX + 1] = {
@@ -51,9 +32,10 @@ rmnet_get_port_rtnl(const struct net_device *real_dev)
 	return rtnl_dereference(real_dev->rx_handler_data);
 }
 
-static int rmnet_unregister_real_device(struct net_device *real_dev,
-					struct rmnet_port *port)
+static int rmnet_unregister_real_device(struct net_device *real_dev)
 {
+	struct rmnet_port *port = rmnet_get_port_rtnl(real_dev);
+
 	if (port->nr_rmnet_devs)
 		return -EINVAL;
 
@@ -93,28 +75,33 @@ static int rmnet_register_real_device(struct net_device *real_dev)
 	return 0;
 }
 
-static void rmnet_unregister_bridge(struct net_device *dev,
-				    struct rmnet_port *port)
+static void rmnet_unregister_bridge(struct rmnet_port *port)
 {
-	struct rmnet_port *bridge_port;
-	struct net_device *bridge_dev;
+	struct net_device *bridge_dev, *real_dev, *rmnet_dev;
+	struct rmnet_port *real_port;
 
 	if (port->rmnet_mode != RMNET_EPMODE_BRIDGE)
 		return;
 
-	/* bridge slave handling */
+	rmnet_dev = port->rmnet_dev;
 	if (!port->nr_rmnet_devs) {
-		bridge_dev = port->bridge_ep;
+		/* bridge device */
+		real_dev = port->bridge_ep;
+		bridge_dev = port->dev;
 
-		bridge_port = rmnet_get_port_rtnl(bridge_dev);
-		bridge_port->bridge_ep = NULL;
-		bridge_port->rmnet_mode = RMNET_EPMODE_VND;
+		real_port = rmnet_get_port_rtnl(real_dev);
+		real_port->bridge_ep = NULL;
+		real_port->rmnet_mode = RMNET_EPMODE_VND;
 	} else {
+		/* real device */
 		bridge_dev = port->bridge_ep;
 
-		bridge_port = rmnet_get_port_rtnl(bridge_dev);
-		rmnet_unregister_real_device(bridge_dev, bridge_port);
+		port->bridge_ep = NULL;
+		port->rmnet_mode = RMNET_EPMODE_VND;
 	}
+
+	netdev_upper_dev_unlink(bridge_dev, rmnet_dev);
+	rmnet_unregister_real_device(bridge_dev);
 }
 
 static int rmnet_newlink(struct net *src_net, struct net_device *dev,
@@ -161,6 +148,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev,
 		goto err2;
 
 	port->rmnet_mode = mode;
+	port->rmnet_dev = dev;
 
 	hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]);
 
@@ -178,8 +166,9 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev,
 
 err2:
 	unregister_netdevice(dev);
+	rmnet_vnd_dellink(mux_id, port, ep);
 err1:
-	rmnet_unregister_real_device(real_dev, port);
+	rmnet_unregister_real_device(real_dev);
 err0:
 	kfree(ep);
 	return err;
@@ -188,30 +177,32 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev,
 static void rmnet_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct rmnet_priv *priv = netdev_priv(dev);
-	struct net_device *real_dev;
+	struct net_device *real_dev, *bridge_dev;
+	struct rmnet_port *real_port, *bridge_port;
 	struct rmnet_endpoint *ep;
-	struct rmnet_port *port;
-	u8 mux_id;
+	u8 mux_id = priv->mux_id;
 
 	real_dev = priv->real_dev;
 
-	if (!real_dev || !rmnet_is_real_dev_registered(real_dev))
+	if (!rmnet_is_real_dev_registered(real_dev))
 		return;
 
-	port = rmnet_get_port_rtnl(real_dev);
+	real_port = rmnet_get_port_rtnl(real_dev);
+	bridge_dev = real_port->bridge_ep;
+	if (bridge_dev) {
+		bridge_port = rmnet_get_port_rtnl(bridge_dev);
+		rmnet_unregister_bridge(bridge_port);
+	}
 
-	mux_id = rmnet_vnd_get_mux(dev);
-
-	ep = rmnet_get_endpoint(port, mux_id);
+	ep = rmnet_get_endpoint(real_port, mux_id);
 	if (ep) {
 		hlist_del_init_rcu(&ep->hlnode);
-		rmnet_unregister_bridge(dev, port);
-		rmnet_vnd_dellink(mux_id, port, ep);
+		rmnet_vnd_dellink(mux_id, real_port, ep);
 		kfree(ep);
 	}
-	netdev_upper_dev_unlink(real_dev, dev);
-	rmnet_unregister_real_device(real_dev, port);
 
+	netdev_upper_dev_unlink(real_dev, dev);
+	rmnet_unregister_real_device(real_dev);
 	unregister_netdevice_queue(dev, head);
 }
 
@@ -223,23 +214,23 @@ static void rmnet_force_unassociate_device(struct net_device *real_dev)
 	unsigned long bkt_ep;
 	LIST_HEAD(list);
 
-	ASSERT_RTNL();
-
 	port = rmnet_get_port_rtnl(real_dev);
 
-	rmnet_unregister_bridge(real_dev, port);
-
-	hash_for_each_safe(port->muxed_ep, bkt_ep, tmp_ep, ep, hlnode) {
-		netdev_upper_dev_unlink(real_dev, ep->egress_dev);
-		unregister_netdevice_queue(ep->egress_dev, &list);
-		rmnet_vnd_dellink(ep->mux_id, port, ep);
-		hlist_del_init_rcu(&ep->hlnode);
-		kfree(ep);
+	if (port->nr_rmnet_devs) {
+		/* real device */
+		rmnet_unregister_bridge(port);
+		hash_for_each_safe(port->muxed_ep, bkt_ep, tmp_ep, ep, hlnode) {
+			unregister_netdevice_queue(ep->egress_dev, &list);
+			netdev_upper_dev_unlink(real_dev, ep->egress_dev);
+			rmnet_vnd_dellink(ep->mux_id, port, ep);
+			hlist_del_init_rcu(&ep->hlnode);
+			kfree(ep);
+		}
+		rmnet_unregister_real_device(real_dev);
+		unregister_netdevice_many(&list);
+	} else {
+		rmnet_unregister_bridge(port);
 	}
-
-	unregister_netdevice_many(&list);
-
-	rmnet_unregister_real_device(real_dev, port);
 }
 
 static int rmnet_config_notify_cb(struct notifier_block *nb,
@@ -418,6 +409,9 @@ int rmnet_add_bridge(struct net_device *rmnet_dev,
 	if (port->nr_rmnet_devs > 1)
 		return -EINVAL;
 
+	if (port->rmnet_mode != RMNET_EPMODE_VND)
+		return -EINVAL;
+
 	if (rmnet_is_real_dev_registered(slave_dev))
 		return -EBUSY;
 
@@ -425,9 +419,17 @@ int rmnet_add_bridge(struct net_device *rmnet_dev,
 	if (err)
 		return -EBUSY;
 
+	err = netdev_master_upper_dev_link(slave_dev, rmnet_dev, NULL, NULL,
+					   extack);
+	if (err) {
+		rmnet_unregister_real_device(slave_dev);
+		return err;
+	}
+
 	slave_port = rmnet_get_port_rtnl(slave_dev);
 	slave_port->rmnet_mode = RMNET_EPMODE_BRIDGE;
 	slave_port->bridge_ep = real_dev;
+	slave_port->rmnet_dev = rmnet_dev;
 
 	port->rmnet_mode = RMNET_EPMODE_BRIDGE;
 	port->bridge_ep = slave_dev;
@@ -439,16 +441,9 @@ int rmnet_add_bridge(struct net_device *rmnet_dev,
 int rmnet_del_bridge(struct net_device *rmnet_dev,
 		     struct net_device *slave_dev)
 {
-	struct rmnet_priv *priv = netdev_priv(rmnet_dev);
-	struct net_device *real_dev = priv->real_dev;
-	struct rmnet_port *port, *slave_port;
+	struct rmnet_port *port = rmnet_get_port_rtnl(slave_dev);
 
-	port = rmnet_get_port_rtnl(real_dev);
-	port->rmnet_mode = RMNET_EPMODE_VND;
-	port->bridge_ep = NULL;
-
-	slave_port = rmnet_get_port_rtnl(slave_dev);
-	rmnet_unregister_real_device(slave_dev, slave_port);
+	rmnet_unregister_bridge(port);
 
 	netdev_dbg(slave_dev, "removed from rmnet as slave\n");
 	return 0;
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
index 0d568dcfd65a..be515982d628 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
@@ -28,6 +28,7 @@ struct rmnet_port {
 	u8 rmnet_mode;
 	struct hlist_head muxed_ep[RMNET_MAX_LOGICAL_EP];
 	struct net_device *bridge_ep;
+	struct net_device *rmnet_dev;
 };
 
 extern struct rtnl_link_ops rmnet_link_ops;
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
index 509dfc895a33..26ad40f19c64 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
@@ -266,14 +266,6 @@ int rmnet_vnd_dellink(u8 id, struct rmnet_port *port,
 	return 0;
 }
 
-u8 rmnet_vnd_get_mux(struct net_device *rmnet_dev)
-{
-	struct rmnet_priv *priv;
-
-	priv = netdev_priv(rmnet_dev);
-	return priv->mux_id;
-}
-
 int rmnet_vnd_do_flow_control(struct net_device *rmnet_dev, int enable)
 {
 	netdev_dbg(rmnet_dev, "Setting VND TX queue state to %d\n", enable);
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
index 54cbaf3c3bc4..14d77c709d4a 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h
@@ -16,6 +16,5 @@ int rmnet_vnd_dellink(u8 id, struct rmnet_port *port,
 		      struct rmnet_endpoint *ep);
 void rmnet_vnd_rx_fixup(struct sk_buff *skb, struct net_device *dev);
 void rmnet_vnd_tx_fixup(struct sk_buff *skb, struct net_device *dev);
-u8 rmnet_vnd_get_mux(struct net_device *rmnet_dev);
 void rmnet_vnd_setup(struct net_device *dev);
 #endif /* _RMNET_VND_H_ */

From ad3cc31b599ea80f06b29ebdc18b3a39878a48d6 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Thu, 27 Feb 2020 12:26:15 +0000
Subject: [PATCH 234/239] net: rmnet: fix packet forwarding in rmnet bridge
 mode

Packet forwarding is not working in rmnet bridge mode.
Because when a packet is forwarded, skb_push() for an ethernet header
is needed. But it doesn't call skb_push().
So, the ethernet header will be lost.

Test commands:
    modprobe rmnet
    ip netns add nst
    ip netns add nst2
    ip link add veth0 type veth peer name veth1
    ip link add veth2 type veth peer name veth3
    ip link set veth1 netns nst
    ip link set veth3 netns nst2

    ip link add rmnet0 link veth0 type rmnet mux_id 1
    ip link set veth2 master rmnet0
    ip link set veth0 up
    ip link set veth2 up
    ip link set rmnet0 up
    ip a a 192.168.100.1/24 dev rmnet0

    ip netns exec nst ip link set veth1 up
    ip netns exec nst ip a a 192.168.100.2/24 dev veth1
    ip netns exec nst2 ip link set veth3 up
    ip netns exec nst2 ip a a 192.168.100.3/24 dev veth3
    ip netns exec nst2 ping 192.168.100.2

Fixes: 60d58f971c10 ("net: qualcomm: rmnet: Implement bridge mode")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
index 074a8b326c30..29a7bfa2584d 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
@@ -159,6 +159,9 @@ static int rmnet_map_egress_handler(struct sk_buff *skb,
 static void
 rmnet_bridge_handler(struct sk_buff *skb, struct net_device *bridge_dev)
 {
+	if (skb_mac_header_was_set(skb))
+		skb_push(skb, skb->mac_len);
+
 	if (bridge_dev) {
 		skb->dev = bridge_dev;
 		dev_queue_xmit(skb);

From 5c05a164d441a1792791175e4959ea9df12f7e2b Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 27 Feb 2020 11:52:35 -0800
Subject: [PATCH 235/239] unix: It's CONFIG_PROC_FS not CONFIG_PROCFS

Fixes: 3a12500ed5dd ("unix: define and set show_fdinfo only if procfs is enabled")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index aa6e2530e1ec..68debcb28fa4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -682,7 +682,7 @@ static int unix_set_peek_off(struct sock *sk, int val)
 	return 0;
 }
 
-#ifdef CONFIG_PROCFS
+#ifdef CONFIG_PROC_FS
 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
 {
 	struct sock *sk = sock->sk;

From 3f74957fcbeab703297ed0f135430414ed7e0dd0 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 26 Feb 2020 11:58:18 +0100
Subject: [PATCH 236/239] vsock: fix potential deadlock in transport->release()

Some transports (hyperv, virtio) acquire the sock lock during the
.release() callback.

In the vsock_stream_connect() we call vsock_assign_transport(); if
the socket was previously assigned to another transport, the
vsk->transport->release() is called, but the sock lock is already
held in the vsock_stream_connect(), causing a deadlock reported by
syzbot:

    INFO: task syz-executor280:9768 blocked for more than 143 seconds.
      Not tainted 5.6.0-rc1-syzkaller #0
    "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
    syz-executor280 D27912  9768   9766 0x00000000
    Call Trace:
     context_switch kernel/sched/core.c:3386 [inline]
     __schedule+0x934/0x1f90 kernel/sched/core.c:4082
     schedule+0xdc/0x2b0 kernel/sched/core.c:4156
     __lock_sock+0x165/0x290 net/core/sock.c:2413
     lock_sock_nested+0xfe/0x120 net/core/sock.c:2938
     virtio_transport_release+0xc4/0xd60 net/vmw_vsock/virtio_transport_common.c:832
     vsock_assign_transport+0xf3/0x3b0 net/vmw_vsock/af_vsock.c:454
     vsock_stream_connect+0x2b3/0xc70 net/vmw_vsock/af_vsock.c:1288
     __sys_connect_file+0x161/0x1c0 net/socket.c:1857
     __sys_connect+0x174/0x1b0 net/socket.c:1874
     __do_sys_connect net/socket.c:1885 [inline]
     __se_sys_connect net/socket.c:1882 [inline]
     __x64_sys_connect+0x73/0xb0 net/socket.c:1882
     do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
     entry_SYSCALL_64_after_hwframe+0x49/0xbe

To avoid this issue, this patch remove the lock acquiring in the
.release() callback of hyperv and virtio transports, and it holds
the lock when we call vsk->transport->release() in the vsock core.

Reported-by: syzbot+731710996d79d0d58fbc@syzkaller.appspotmail.com
Fixes: 408624af4c89 ("vsock: use local transport when it is loaded")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/af_vsock.c                | 20 ++++++++++++--------
 net/vmw_vsock/hyperv_transport.c        |  3 ---
 net/vmw_vsock/virtio_transport_common.c |  2 --
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 9c5b2a91baad..a5f28708e0e7 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -451,6 +451,12 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
 		if (vsk->transport == new_transport)
 			return 0;
 
+		/* transport->release() must be called with sock lock acquired.
+		 * This path can only be taken during vsock_stream_connect(),
+		 * where we have already held the sock lock.
+		 * In the other cases, this function is called on a new socket
+		 * which is not assigned to any transport.
+		 */
 		vsk->transport->release(vsk);
 		vsock_deassign_transport(vsk);
 	}
@@ -753,20 +759,18 @@ static void __vsock_release(struct sock *sk, int level)
 		vsk = vsock_sk(sk);
 		pending = NULL;	/* Compiler warning. */
 
-		/* The release call is supposed to use lock_sock_nested()
-		 * rather than lock_sock(), if a sock lock should be acquired.
-		 */
-		if (vsk->transport)
-			vsk->transport->release(vsk);
-		else if (sk->sk_type == SOCK_STREAM)
-			vsock_remove_sock(vsk);
-
 		/* When "level" is SINGLE_DEPTH_NESTING, use the nested
 		 * version to avoid the warning "possible recursive locking
 		 * detected". When "level" is 0, lock_sock_nested(sk, level)
 		 * is the same as lock_sock(sk).
 		 */
 		lock_sock_nested(sk, level);
+
+		if (vsk->transport)
+			vsk->transport->release(vsk);
+		else if (sk->sk_type == SOCK_STREAM)
+			vsock_remove_sock(vsk);
+
 		sock_orphan(sk);
 		sk->sk_shutdown = SHUTDOWN_MASK;
 
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 3492c021925f..630b851f8150 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -526,12 +526,9 @@ static bool hvs_close_lock_held(struct vsock_sock *vsk)
 
 static void hvs_release(struct vsock_sock *vsk)
 {
-	struct sock *sk = sk_vsock(vsk);
 	bool remove_sock;
 
-	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
 	remove_sock = hvs_close_lock_held(vsk);
-	release_sock(sk);
 	if (remove_sock)
 		vsock_remove_sock(vsk);
 }
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index d9f0c9c5425a..f3c4bab2f737 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -829,7 +829,6 @@ void virtio_transport_release(struct vsock_sock *vsk)
 	struct sock *sk = &vsk->sk;
 	bool remove_sock = true;
 
-	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
 	if (sk->sk_type == SOCK_STREAM)
 		remove_sock = virtio_transport_close(vsk);
 
@@ -837,7 +836,6 @@ void virtio_transport_release(struct vsock_sock *vsk)
 		list_del(&pkt->list);
 		virtio_transport_free_pkt(pkt);
 	}
-	release_sock(sk);
 
 	if (remove_sock)
 		vsock_remove_sock(vsk);

From 23797b98909f34b75fd130369bde86f760db69d0 Mon Sep 17 00:00:00 2001
From: "Alex Maftei (amaftei)" <amaftei@solarflare.com>
Date: Wed, 26 Feb 2020 17:33:19 +0000
Subject: [PATCH 237/239] sfc: fix timestamp reconstruction at 16-bit rollover
 points

We can't just use the top bits of the last sync event as they could be
off-by-one every 65,536 seconds, giving an error in reconstruction of
65,536 seconds.

This patch uses the difference in the bottom 16 bits (mod 2^16) to
calculate an offset that needs to be applied to the last sync event to
get to the current time.

Signed-off-by: Alexandru-Mihai Maftei <amaftei@solarflare.com>
Acked-by: Martin Habets <mhabets@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/ptp.c | 38 +++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index af15a737c675..59b4f16896a8 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -560,13 +560,45 @@ efx_ptp_mac_nic_to_ktime_correction(struct efx_nic *efx,
 				    u32 nic_major, u32 nic_minor,
 				    s32 correction)
 {
+	u32 sync_timestamp;
 	ktime_t kt = { 0 };
+	s16 delta;
 
 	if (!(nic_major & 0x80000000)) {
 		WARN_ON_ONCE(nic_major >> 16);
-		/* Use the top bits from the latest sync event. */
-		nic_major &= 0xffff;
-		nic_major |= (last_sync_timestamp_major(efx) & 0xffff0000);
+
+		/* Medford provides 48 bits of timestamp, so we must get the top
+		 * 16 bits from the timesync event state.
+		 *
+		 * We only have the lower 16 bits of the time now, but we do
+		 * have a full resolution timestamp at some point in past. As
+		 * long as the difference between the (real) now and the sync
+		 * is less than 2^15, then we can reconstruct the difference
+		 * between those two numbers using only the lower 16 bits of
+		 * each.
+		 *
+		 * Put another way
+		 *
+		 * a - b = ((a mod k) - b) mod k
+		 *
+		 * when -k/2 < (a-b) < k/2. In our case k is 2^16. We know
+		 * (a mod k) and b, so can calculate the delta, a - b.
+		 *
+		 */
+		sync_timestamp = last_sync_timestamp_major(efx);
+
+		/* Because delta is s16 this does an implicit mask down to
+		 * 16 bits which is what we need, assuming
+		 * MEDFORD_TX_SECS_EVENT_BITS is 16. delta is signed so that
+		 * we can deal with the (unlikely) case of sync timestamps
+		 * arriving from the future.
+		 */
+		delta = nic_major - sync_timestamp;
+
+		/* Recover the fully specified time now, by applying the offset
+		 * to the (fully specified) sync time.
+		 */
+		nic_major = sync_timestamp + delta;
 
 		kt = ptp->nic_to_kernel_time(nic_major, nic_minor,
 					     correction);

From ac004e84164e27d69017731a97b11402a69d854b Mon Sep 17 00:00:00 2001
From: Amit Cohen <amitc@mellanox.com>
Date: Thu, 27 Feb 2020 21:07:53 +0100
Subject: [PATCH 238/239] mlxsw: pci: Wait longer before accessing the device
 after reset

During initialization the driver issues a reset to the device and waits
for 100ms before checking if the firmware is ready. The waiting is
necessary because before that the device is irresponsive and the first
read can result in a completion timeout.

While 100ms is sufficient for Spectrum-1 and Spectrum-2, it is
insufficient for Spectrum-3.

Fix this by increasing the timeout to 200ms.

Fixes: da382875c616 ("mlxsw: spectrum: Extend to support Spectrum-3 ASIC")
Signed-off-by: Amit Cohen <amitc@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
index e0d7d2d9a0c8..43fa8c85b5d9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
@@ -28,7 +28,7 @@
 #define MLXSW_PCI_SW_RESET			0xF0010
 #define MLXSW_PCI_SW_RESET_RST_BIT		BIT(0)
 #define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS	900000
-#define MLXSW_PCI_SW_RESET_WAIT_MSECS		100
+#define MLXSW_PCI_SW_RESET_WAIT_MSECS		200
 #define MLXSW_PCI_FW_READY			0xA1844
 #define MLXSW_PCI_FW_READY_MASK			0xFFFF
 #define MLXSW_PCI_FW_READY_MAGIC		0x5E

From 3ee339eb28959629db33aaa2b8cde4c63c6289eb Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 27 Feb 2020 21:20:49 +0100
Subject: [PATCH 239/239] net: dsa: mv88e6xxx: Fix masking of egress port

Add missing ~ to the usage of the mask.

Reported-by: Kevin Benson <Kevin.Benson@zii.aero>
Reported-by: Chris Healy <Chris.Healy@zii.aero>
Fixes: 5c74c54ce6ff ("net: dsa: mv88e6xxx: Split monitor port configuration")
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/dsa/mv88e6xxx/global1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/global1.c b/drivers/net/dsa/mv88e6xxx/global1.c
index b016cc205f81..ca3a7a7a73c3 100644
--- a/drivers/net/dsa/mv88e6xxx/global1.c
+++ b/drivers/net/dsa/mv88e6xxx/global1.c
@@ -278,13 +278,13 @@ int mv88e6095_g1_set_egress_port(struct mv88e6xxx_chip *chip,
 	switch (direction) {
 	case MV88E6XXX_EGRESS_DIR_INGRESS:
 		dest_port_chip = &chip->ingress_dest_port;
-		reg &= MV88E6185_G1_MONITOR_CTL_INGRESS_DEST_MASK;
+		reg &= ~MV88E6185_G1_MONITOR_CTL_INGRESS_DEST_MASK;
 		reg |= port <<
 		       __bf_shf(MV88E6185_G1_MONITOR_CTL_INGRESS_DEST_MASK);
 		break;
 	case MV88E6XXX_EGRESS_DIR_EGRESS:
 		dest_port_chip = &chip->egress_dest_port;
-		reg &= MV88E6185_G1_MONITOR_CTL_EGRESS_DEST_MASK;
+		reg &= ~MV88E6185_G1_MONITOR_CTL_EGRESS_DEST_MASK;
 		reg |= port <<
 		       __bf_shf(MV88E6185_G1_MONITOR_CTL_EGRESS_DEST_MASK);
 		break;