ptp: fix the race between the release of ptp_clock and cdev

When a ptp chardev (like /dev/ptp0) is open and the underlying device
is removed, closing the file leads to a race that ends in a
use-after-free. This is easy to reproduce in a KVM virtual machine:

ts# cat openptp0.c
int main() { ... fp = fopen("/dev/ptp0", "r"); ... sleep(10); }
ts# uname -r
5.5.0-rc3-46cf053e
ts# cat /proc/cmdline
... slub_debug=FZP
ts# modprobe ptp_kvm
ts# ./openptp0 &
[1] 670
opened /dev/ptp0, sleeping 10s...
ts# rmmod ptp_kvm
ts# ls /dev/ptp*
ls: cannot access '/dev/ptp*': No such file or directory
ts# ...woken up
[   48.010809] general protection fault: 0000 [#1] SMP
[   48.012502] CPU: 6 PID: 658 Comm: openptp0 Not tainted 5.5.0-rc3-46cf053e #25
[   48.014624] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
[   48.016270] RIP: 0010:module_put.part.0+0x7/0x80
[   48.017939] RSP: 0018:ffffb3850073be00 EFLAGS: 00010202
[   48.018339] RAX: 000000006b6b6b6b RBX: 6b6b6b6b6b6b6b6b RCX: ffff89a476c00ad0
[   48.018936] RDX: fffff65a08d3ea08 RSI: 0000000000000247 RDI: 6b6b6b6b6b6b6b6b
[   48.019470] ...                                              ^^^ a slub poison
[   48.023854] Call Trace:
[   48.024050]  __fput+0x21f/0x240
[   48.024288]  task_work_run+0x79/0x90
[   48.024555]  do_exit+0x2af/0xab0
[   48.024799]  ? vfs_write+0x16a/0x190
[   48.025082]  do_group_exit+0x35/0x90
[   48.025387]  __x64_sys_exit_group+0xf/0x10
[   48.025737]  do_syscall_64+0x3d/0x130
[   48.026056]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   48.026479] RIP: 0033:0x7f53b12082f6
[   48.026792] ...
[   48.030945] Modules linked in: ptp i6300esb watchdog [last unloaded: ptp_kvm]
[   48.045001] Fixing recursive fault but reboot is needed!

This happens in:

static void __fput(struct file *file)
{   ...
    if (file->f_op->release)
        file->f_op->release(inode, file); <<< cdev is kfree'd here
    if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
             !(mode & FMODE_PATH))) {
        cdev_put(inode->i_cdev); <<< cdev fields are accessed here

Namely:

__fput()
  posix_clock_release()
    kref_put(&clk->kref, delete_clock) <<< the last reference
      delete_clock()
        delete_ptp_clock()
          kfree(ptp) <<< cdev is embedded in ptp
  cdev_put
    module_put(p->owner) <<< *p is kfree'd, bang!

Here cdev is embedded in posix_clock, which is embedded in ptp_clock.
The race happens because ptp_clock's lifetime is controlled by two
independent refcounts: kref and cdev.kobj in posix_clock. This is wrong:
dropping the last kref frees the memory while the cdev is still
referenced.

Make ptp_clock's sysfs device the parent of the cdev using
cdev_device_add(), which was created especially for such cases. This way
the parent device, together with its ptp_clock, is not released until
all references to the cdev are released. This adds a requirement that
the caller provide an initialized but not yet exposed struct device to
posix_clock_register() instead of a simple dev_t.

This approach was adopted from commit 72139dfa24 ("watchdog: Fix
the race between the release of watchdog_core_data and cdev"). For
details of the implementation, see commit 233ed09d7f ("chardev: add
helper function to register char devs with a struct device").

Link: https://lore.kernel.org/linux-fsdevel/20191125125342.6189-1-vdronov@redhat.com/T/#u
Analyzed-by: Stephen Johnston <sjohnsto@redhat.com>
Analyzed-by: Vern Lovejoy <vlovejoy@redhat.com>
Signed-off-by: Vladis Dronov <vdronov@redhat.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Vladis Dronov 2019-12-27 03:26:27 +01:00 committed by David S. Miller
parent 54fa49ee88
commit a33121e548
4 changed files with 39 additions and 44 deletions

View File

@ -166,9 +166,9 @@ static struct posix_clock_operations ptp_clock_ops = {
.read = ptp_read, .read = ptp_read,
}; };
static void delete_ptp_clock(struct posix_clock *pc) static void ptp_clock_release(struct device *dev)
{ {
struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); struct ptp_clock *ptp = container_of(dev, struct ptp_clock, dev);
mutex_destroy(&ptp->tsevq_mux); mutex_destroy(&ptp->tsevq_mux);
mutex_destroy(&ptp->pincfg_mux); mutex_destroy(&ptp->pincfg_mux);
@ -213,7 +213,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
} }
ptp->clock.ops = ptp_clock_ops; ptp->clock.ops = ptp_clock_ops;
ptp->clock.release = delete_ptp_clock;
ptp->info = info; ptp->info = info;
ptp->devid = MKDEV(major, index); ptp->devid = MKDEV(major, index);
ptp->index = index; ptp->index = index;
@ -236,15 +235,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
if (err) if (err)
goto no_pin_groups; goto no_pin_groups;
/* Create a new device in our class. */
ptp->dev = device_create_with_groups(ptp_class, parent, ptp->devid,
ptp, ptp->pin_attr_groups,
"ptp%d", ptp->index);
if (IS_ERR(ptp->dev)) {
err = PTR_ERR(ptp->dev);
goto no_device;
}
/* Register a new PPS source. */ /* Register a new PPS source. */
if (info->pps) { if (info->pps) {
struct pps_source_info pps; struct pps_source_info pps;
@ -260,8 +250,18 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
} }
} }
/* Create a posix clock. */ /* Initialize a new device of our class in our clock structure. */
err = posix_clock_register(&ptp->clock, ptp->devid); device_initialize(&ptp->dev);
ptp->dev.devt = ptp->devid;
ptp->dev.class = ptp_class;
ptp->dev.parent = parent;
ptp->dev.groups = ptp->pin_attr_groups;
ptp->dev.release = ptp_clock_release;
dev_set_drvdata(&ptp->dev, ptp);
dev_set_name(&ptp->dev, "ptp%d", ptp->index);
/* Create a posix clock and link it to the device. */
err = posix_clock_register(&ptp->clock, &ptp->dev);
if (err) { if (err) {
pr_err("failed to create posix clock\n"); pr_err("failed to create posix clock\n");
goto no_clock; goto no_clock;
@ -273,8 +273,6 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
if (ptp->pps_source) if (ptp->pps_source)
pps_unregister_source(ptp->pps_source); pps_unregister_source(ptp->pps_source);
no_pps: no_pps:
device_destroy(ptp_class, ptp->devid);
no_device:
ptp_cleanup_pin_groups(ptp); ptp_cleanup_pin_groups(ptp);
no_pin_groups: no_pin_groups:
if (ptp->kworker) if (ptp->kworker)
@ -304,7 +302,6 @@ int ptp_clock_unregister(struct ptp_clock *ptp)
if (ptp->pps_source) if (ptp->pps_source)
pps_unregister_source(ptp->pps_source); pps_unregister_source(ptp->pps_source);
device_destroy(ptp_class, ptp->devid);
ptp_cleanup_pin_groups(ptp); ptp_cleanup_pin_groups(ptp);
posix_clock_unregister(&ptp->clock); posix_clock_unregister(&ptp->clock);

View File

@ -28,7 +28,7 @@ struct timestamp_event_queue {
struct ptp_clock { struct ptp_clock {
struct posix_clock clock; struct posix_clock clock;
struct device *dev; struct device dev;
struct ptp_clock_info *info; struct ptp_clock_info *info;
dev_t devid; dev_t devid;
int index; /* index into clocks.map */ int index; /* index into clocks.map */

View File

@ -69,29 +69,32 @@ struct posix_clock_operations {
* *
* @ops: Functional interface to the clock * @ops: Functional interface to the clock
* @cdev: Character device instance for this clock * @cdev: Character device instance for this clock
* @kref: Reference count. * @dev: Pointer to the clock's device.
* @rwsem: Protects the 'zombie' field from concurrent access. * @rwsem: Protects the 'zombie' field from concurrent access.
* @zombie: If 'zombie' is true, then the hardware has disappeared. * @zombie: If 'zombie' is true, then the hardware has disappeared.
* @release: A function to free the structure when the reference count reaches
* zero. May be NULL if structure is statically allocated.
* *
* Drivers should embed their struct posix_clock within a private * Drivers should embed their struct posix_clock within a private
* structure, obtaining a reference to it during callbacks using * structure, obtaining a reference to it during callbacks using
* container_of(). * container_of().
*
* Drivers should supply an initialized but not exposed struct device
* to posix_clock_register(). It is used to manage lifetime of the
* driver's private structure. Its 'release' field should be set to
* a release function for this private structure.
*/ */
struct posix_clock { struct posix_clock {
struct posix_clock_operations ops; struct posix_clock_operations ops;
struct cdev cdev; struct cdev cdev;
struct kref kref; struct device *dev;
struct rw_semaphore rwsem; struct rw_semaphore rwsem;
bool zombie; bool zombie;
void (*release)(struct posix_clock *clk);
}; };
/** /**
* posix_clock_register() - register a new clock * posix_clock_register() - register a new clock
* @clk: Pointer to the clock. Caller must provide 'ops' and 'release' * @clk: Pointer to the clock. Caller must provide 'ops' field
* @devid: Allocated device id * @dev: Pointer to the initialized device. Caller must provide
* 'release' field
* *
* A clock driver calls this function to register itself with the * A clock driver calls this function to register itself with the
* clock device subsystem. If 'clk' points to dynamically allocated * clock device subsystem. If 'clk' points to dynamically allocated
@ -100,7 +103,7 @@ struct posix_clock {
* *
* Returns zero on success, non-zero otherwise. * Returns zero on success, non-zero otherwise.
*/ */
int posix_clock_register(struct posix_clock *clk, dev_t devid); int posix_clock_register(struct posix_clock *clk, struct device *dev);
/** /**
* posix_clock_unregister() - unregister a clock * posix_clock_unregister() - unregister a clock

View File

@ -14,8 +14,6 @@
#include "posix-timers.h" #include "posix-timers.h"
static void delete_clock(struct kref *kref);
/* /*
* Returns NULL if the posix_clock instance attached to 'fp' is old and stale. * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
*/ */
@ -125,7 +123,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = 0; err = 0;
if (!err) { if (!err) {
kref_get(&clk->kref); get_device(clk->dev);
fp->private_data = clk; fp->private_data = clk;
} }
out: out:
@ -141,7 +139,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
if (clk->ops.release) if (clk->ops.release)
err = clk->ops.release(clk); err = clk->ops.release(clk);
kref_put(&clk->kref, delete_clock); put_device(clk->dev);
fp->private_data = NULL; fp->private_data = NULL;
@ -161,38 +159,35 @@ static const struct file_operations posix_clock_file_operations = {
#endif #endif
}; };
int posix_clock_register(struct posix_clock *clk, dev_t devid) int posix_clock_register(struct posix_clock *clk, struct device *dev)
{ {
int err; int err;
kref_init(&clk->kref);
init_rwsem(&clk->rwsem); init_rwsem(&clk->rwsem);
cdev_init(&clk->cdev, &posix_clock_file_operations); cdev_init(&clk->cdev, &posix_clock_file_operations);
err = cdev_device_add(&clk->cdev, dev);
if (err) {
pr_err("%s unable to add device %d:%d\n",
dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
return err;
}
clk->cdev.owner = clk->ops.owner; clk->cdev.owner = clk->ops.owner;
err = cdev_add(&clk->cdev, devid, 1); clk->dev = dev;
return err; return 0;
} }
EXPORT_SYMBOL_GPL(posix_clock_register); EXPORT_SYMBOL_GPL(posix_clock_register);
static void delete_clock(struct kref *kref)
{
struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
if (clk->release)
clk->release(clk);
}
void posix_clock_unregister(struct posix_clock *clk) void posix_clock_unregister(struct posix_clock *clk)
{ {
cdev_del(&clk->cdev); cdev_device_del(&clk->cdev, clk->dev);
down_write(&clk->rwsem); down_write(&clk->rwsem);
clk->zombie = true; clk->zombie = true;
up_write(&clk->rwsem); up_write(&clk->rwsem);
kref_put(&clk->kref, delete_clock); put_device(clk->dev);
} }
EXPORT_SYMBOL_GPL(posix_clock_unregister); EXPORT_SYMBOL_GPL(posix_clock_unregister);