From f4fa3424c66255ba1e07b26becfc3d6cab0fdf65 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 May 2011 21:01:38 +0200 Subject: [PATCH 01/10] block: fix oops on !disk->queue and sysfs discard alignment display Eric Dumazet reports: ---- At boot, I have a crash in part_discard_alignment_show+0x1b/0x50 CR2 : 000006ac fault in : mov 0x2c(%rcx),%edx I suspect commit 23ceb5b7719e9276d4 (block: Remove extra discard_alignment from hd_struct) being in fault ---- It is not yet known how ->queue can be NULL while the sysfs entry exists, but let's play it safe and check for a NULL queue. The rest of the sysfs show strategies in check.c do not dereference disk->queue. Reported-by: Eric Dumazet Signed-off-by: Jens Axboe --- fs/partitions/check.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 8ed4d3433199..f82e762eeca2 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -256,10 +256,12 @@ ssize_t part_discard_alignment_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); struct gendisk *disk = dev_to_disk(dev); + unsigned int alignment = 0; - return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&disk->queue->limits, - p->start_sect)); + if (disk->queue) + alignment = queue_limit_discard_alignment(&disk->queue->limits, + p->start_sect); + return sprintf(buf, "%u\n", alignment); } ssize_t part_stat_show(struct device *dev, From a2cba2913c7623789296e39d787b01031f9c7969 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 26 May 2011 21:06:50 +0200 Subject: [PATCH 02/10] brd: get rid of unused members from struct brd_device brd_refcnt, brd_offset, brd_sizelimit and brd_blocksize in struct brd_device seem to be copied from struct loop_device but they're not used anywhere. Let's get rid of them. 
Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- drivers/block/brd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index b7f51e4594f8..bae9a3d4e15b 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -35,10 +35,6 @@ */ struct brd_device { int brd_number; - int brd_refcnt; - loff_t brd_offset; - loff_t brd_sizelimit; - unsigned brd_blocksize; struct request_queue *brd_queue; struct gendisk *brd_disk; From 315980c8688c4b06713c1a5fe9d64cdf8ab57a72 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 26 May 2011 21:06:50 +0200 Subject: [PATCH 03/10] brd: limit 'max_part' module param to DISK_MAX_PARTS The 'max_part' parameter controls the maximum number of partitions a brd device can have. However, if a user specifies a very large value, it would exceed the limit of the device minor number and can cause a kernel panic (or, at least, produce invalid device nodes in some cases). On my desktop system, the following command kills the kernel. 
On qemu, it triggers similar oops but the kernel was alive: $ sudo modprobe brd max_part=100000 BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 IP: [] sysfs_create_dir+0x2d/0xae PGD 7af1067 PUD 7b19067 PMD 0 Oops: 0000 [#1] SMP last sysfs file: CPU 0 Modules linked in: brd(+) Pid: 44, comm: insmod Tainted: G W 2.6.39-qemu+ #158 Bochs Bochs RIP: 0010:[] [] sysfs_create_dir+0x2d/0xae RSP: 0018:ffff880007b15d78 EFLAGS: 00000286 RAX: ffff880007b05478 RBX: ffff880007a52760 RCX: ffff880007b15dc8 RDX: ffff880007a4f900 RSI: ffff880007b15e48 RDI: ffff880007a52760 RBP: ffff880007b15da8 R08: 0000000000000002 R09: 0000000000000000 R10: ffff880007b15e48 R11: ffff880007b05478 R12: 0000000000000000 R13: ffff880007b05478 R14: 0000000000400920 R15: 0000000000000063 FS: 0000000002160880(0063) GS:ffff880007c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000058 CR3: 0000000007b1c000 CR4: 00000000000006b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 0000000000000000 DR7: 0000000000000000 Process insmod (pid: 44, threadinfo ffff880007b14000, task ffff880007acb980) Stack: ffff880007b15dc8 ffff880007b05478 ffff880007b15da8 00000000fffffffe ffff880007a52760 ffff880007b05478 ffff880007b15de8 ffffffff81143c0a 0000000000400920 ffff880007a52760 ffff880007b05478 0000000000000000 Call Trace: [] kobject_add_internal+0xdf/0x1a0 [] kobject_add_varg+0x41/0x50 [] kobject_add+0x64/0x66 [] blk_register_queue+0x5f/0xb8 [] add_disk+0xdf/0x289 [] brd_init+0xdf/0x1aa [brd] [] ? 0xffffffffa0003fff [] ? 
0xffffffffa0003fff [] do_one_initcall+0x7a/0x12e [] sys_init_module+0x9c/0x1dc [] system_call_fastpath+0x16/0x1b Code: 89 e5 41 55 41 54 53 48 89 fb 48 83 ec 18 48 85 ff 75 04 0f 0b eb fe 48 8b 47 18 49 c7 c4 70 1e 4d 81 48 85 c0 74 04 4c 8b 60 30 8b 44 24 58 45 31 ed 0f b6 c4 85 c0 74 0d 48 8b 43 28 48 89 RIP [] sysfs_create_dir+0x2d/0xae RSP CR2: 0000000000000058 ---[ end trace aebb1175ce1f6739 ]--- Signed-off-by: Namhyung Kim Cc: Laurent Vivier Cc: stable@kernel.org Signed-off-by: Jens Axboe --- drivers/block/brd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index bae9a3d4e15b..e9a19d99f928 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -581,6 +581,9 @@ static int __init brd_init(void) if (max_part > 0) part_shift = fls(max_part); + if ((1UL << part_shift) > DISK_MAX_PARTS) + return -EINVAL; + if (rd_nr > 1UL << (MINORBITS - part_shift)) return -EINVAL; From af46566885a373b0a526932484cd8fef8de7b598 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 26 May 2011 21:06:50 +0200 Subject: [PATCH 04/10] brd: handle on-demand devices correctly When finding or allocating a ram disk device, brd_probe() did not take partition numbers into account, so the lookup could resolve to a different device. 
Consider following example (I set CONFIG_BLK_DEV_RAM_COUNT=4 for simplicity) : $ sudo modprobe brd max_part=15 $ ls -l /dev/ram* brw-rw---- 1 root disk 1, 0 2011-05-25 15:41 /dev/ram0 brw-rw---- 1 root disk 1, 16 2011-05-25 15:41 /dev/ram1 brw-rw---- 1 root disk 1, 32 2011-05-25 15:41 /dev/ram2 brw-rw---- 1 root disk 1, 48 2011-05-25 15:41 /dev/ram3 $ sudo mknod /dev/ram4 b 1 64 $ sudo dd if=/dev/zero of=/dev/ram4 bs=4k count=256 256+0 records in 256+0 records out 1048576 bytes (1.0 MB) copied, 0.00215578 s, 486 MB/s namhyung@leonhard:linux$ ls -l /dev/ram* brw-rw---- 1 root disk 1, 0 2011-05-25 15:41 /dev/ram0 brw-rw---- 1 root disk 1, 16 2011-05-25 15:41 /dev/ram1 brw-rw---- 1 root disk 1, 32 2011-05-25 15:41 /dev/ram2 brw-rw---- 1 root disk 1, 48 2011-05-25 15:41 /dev/ram3 brw-r--r-- 1 root root 1, 64 2011-05-25 15:45 /dev/ram4 brw-rw---- 1 root disk 1, 1024 2011-05-25 15:44 /dev/ram64 After this patch, /dev/ram4 - instead of /dev/ram64 - was accessed correctly. In addition, 'range' passed to blk_register_region() should include all range of dev_t that RAMDISK_MAJOR can address. It does not need to be limited by partition numbers unless 'rd_nr' param was specified. Signed-off-by: Namhyung Kim Cc: Laurent Vivier Cc: stable@kernel.org Signed-off-by: Jens Axboe --- drivers/block/brd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index e9a19d99f928..b1efa8f9ff42 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -548,7 +548,7 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data) struct kobject *kobj; mutex_lock(&brd_devices_mutex); - brd = brd_init_one(dev & MINORMASK); + brd = brd_init_one(MINOR(dev) >> part_shift); kobj = brd ? 
get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM); mutex_unlock(&brd_devices_mutex); @@ -589,10 +589,10 @@ static int __init brd_init(void) if (rd_nr) { nr = rd_nr; - range = rd_nr; + range = rd_nr << part_shift; } else { nr = CONFIG_BLK_DEV_RAM_COUNT; - range = 1UL << (MINORBITS - part_shift); + range = 1UL << MINORBITS; } if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) @@ -631,7 +631,7 @@ static void __exit brd_exit(void) unsigned long range; struct brd_device *brd, *next; - range = rd_nr ? rd_nr : 1UL << (MINORBITS - part_shift); + range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS; list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); From 13868b76ab8135389fe1d8cf1c6a4847c9fef0a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 26 May 2011 21:06:50 +0200 Subject: [PATCH 05/10] brd: fix comment on initial device creation If 'rd_nr' param was not specified, 16 (can be adjusted via CONFIG_BLK_DEV_RAM_COUNT) devices would be created by default but comment said 1. Fix it. Signed-off-by: Namhyung Kim Signed-off-by: Jens Axboe --- drivers/block/brd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index b1efa8f9ff42..d904d0da2928 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -571,10 +571,10 @@ static int __init brd_init(void) * * (1) if rd_nr is specified, create that many upfront, and this * also becomes a hard limit. - * (2) if rd_nr is not specified, create 1 rd device on module - * load, user can further extend brd device by create dev node - * themselves and have kernel automatically instantiate actual - * device on-demand. + * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT + * (default 16) rd device on module load, user can further + * extend brd device by create dev node themselves and have + * kernel automatically instantiate actual device on-demand. 
*/ part_shift = 0; From 8892cbaf686fb18a5f0558b9fd7773b32c0c7852 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 26 May 2011 21:06:50 +0200 Subject: [PATCH 06/10] brd: export module parameters Export the 'rd_nr', 'rd_size' and 'max_part' parameters to sysfs so that users can see how many devices are allowed, how big each device is and how many partitions are supported. If 'max_part' is 0, it simply means the device doesn't support partitioning. Also note that 'max_part' is rounded up to the next value of the form "power of 2 minus 1" if needed. Users should check this value after the module is loaded if they want to use that number correctly (e.g. with fdisk, mknod, etc.). Signed-off-by: Namhyung Kim Cc: Laurent Vivier Signed-off-by: Jens Axboe --- drivers/block/brd.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index d904d0da2928..dba1c32e1ddf 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -436,11 +436,11 @@ static int rd_nr; int rd_size = CONFIG_BLK_DEV_RAM_SIZE; static int max_part; static int part_shift; -module_param(rd_nr, int, 0); +module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); -module_param(rd_size, int, 0); +module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); -module_param(max_part, int, 0); +module_param(max_part, int, S_IRUGO); MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); @@ -578,9 +578,20 @@ static int __init brd_init(void) */ part_shift = 0; - if (max_part > 0) + if (max_part > 0) { part_shift = fls(max_part); + /* + * Adjust max_part according to part_shift as it is exported + * to user space so that user can decide correct minor number + * if [s]he want to create more devices. + * + * Note that -1 is required because partition 0 is reserved + * for the whole disk. 
+ */ + max_part = (1UL << part_shift) - 1; + } + if ((1UL << part_shift) > DISK_MAX_PARTS) return -EINVAL; From 75e3f3ee3c64968d42f4843ec49e579f84b5aa0c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 May 2011 21:06:50 +0200 Subject: [PATCH 07/10] block: always allocate genhd->ev if check_events is implemented 9fd097b149 (block: unexport DISK_EVENT_MEDIA_CHANGE for legacy/fringe drivers) removed DISK_EVENT_MEDIA_CHANGE from legacy/fringe block drivers which have inadequate ->check_events(). Combined with earlier change 7c88a168da (block: don't propagate unlisted DISK_EVENTs to userland), this enables using ->check_events() for internal processing while avoiding enabling in-kernel block event polling which can lead to infinite event loop. Unfortunately, this made many drivers including floppy without any bit set in disk->events and ->async_events in which case disk_add_events() simply skipped allocation of disk->ev, which disables whole event handling. As ->check_events() is still used during open processing for revalidation, this can lead to open failure. This patch always allocates disk->ev if ->check_events is implemented. In the long term, it would make sense to simply include the event structure inline into genhd as it's now used by virtually all block devices. 
Signed-off-by: Tejun Heo Reported-by: Ondrej Zary Reported-by: Alex Villacis Lasso Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 2dd988723d73..95822ae25cfe 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1728,7 +1728,7 @@ static void disk_add_events(struct gendisk *disk) { struct disk_events *ev; - if (!disk->fops->check_events || !(disk->events | disk->async_events)) + if (!disk->fops->check_events) return; ev = kzalloc(sizeof(*ev), GFP_KERNEL); From 700c4f3325495d2e0e619fb48b900ec942f1470b Mon Sep 17 00:00:00 2001 From: Luca Tettamanti Date: Thu, 26 May 2011 21:07:26 +0200 Subject: [PATCH 08/10] block: remove unused variable in bio_attempt_front_merge() sector is never read inside the function. Signed-off-by: Luca Tettamanti Signed-off-by: Jens Axboe --- block/blk-core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index c8303e9d919d..dd8ae71168c5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1130,7 +1130,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; - sector_t sector; if (!ll_front_merge_fn(q, req, bio)) return false; @@ -1140,8 +1139,6 @@ static bool bio_attempt_front_merge(struct request_queue *q, if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) blk_rq_set_mixed_merge(req); - sector = bio->bi_sector; - bio->bi_next = req->bio; req->bio = bio; From d86e0e83b32bc84600adb0b6ea1fce389b266682 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 May 2011 07:44:43 +0200 Subject: [PATCH 09/10] block: export blk_{get,put}_queue() We need them in SCSI to fix a bug, but currently they are not exported to modules. Export them. 
Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index dd8ae71168c5..d2f8f4049abd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -345,6 +345,7 @@ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } +EXPORT_SYMBOL(blk_put_queue); /* * Note: If a driver supplied the queue lock, it should not zap that lock @@ -566,6 +567,7 @@ int blk_get_queue(struct request_queue *q) return 1; } +EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_queue *q, struct request *rq) { From ac04fee0b5c55bbac0858727a4154110b55d3f5a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 27 May 2011 07:59:25 +0200 Subject: [PATCH 10/10] loop: export module parameters Export the 'max_loop' and 'max_part' parameters to sysfs so that users can see how many devices are allowed and how many partitions are supported. If 'max_loop' is 0, there is no restriction on the number of loop devices. Users can create and use as many devices as there are minor numbers available. If 'max_part' is 0, it simply means the device doesn't support partitioning. Also note that 'max_part' is rounded up to the next value of the form "power of 2 minus 1" if needed. Users should check this value after the module is loaded if they want to use that number correctly (e.g. with fdisk, mknod, etc.). Signed-off-by: Namhyung Kim Cc: Laurent Vivier Signed-off-by: Jens Axboe --- drivers/block/loop.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c59a672a3de0..76c8da78212b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1540,9 +1540,9 @@ static const struct block_device_operations lo_fops = { * And now the modules code and kernel interface. 
*/ static int max_loop; -module_param(max_loop, int, 0); +module_param(max_loop, int, S_IRUGO); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); -module_param(max_part, int, 0); +module_param(max_part, int, S_IRUGO); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1688,9 +1688,20 @@ static int __init loop_init(void) */ part_shift = 0; - if (max_part > 0) + if (max_part > 0) { part_shift = fls(max_part); + /* + * Adjust max_part according to part_shift as it is exported + * to user space so that user can decide correct minor number + * if [s]he want to create more devices. + * + * Note that -1 is required because partition 0 is reserved + * for the whole disk. + */ + max_part = (1UL << part_shift) - 1; + } + if ((1UL << part_shift) > DISK_MAX_PARTS) return -EINVAL;