From 27ea43fe2a32f63bb6f442dafc2133232b8af4a6 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Sat, 24 Oct 2015 21:15:34 +0200 Subject: [PATCH 01/49] nbd: Fix debugfs error handling Static checker complains about the implemented error handling. It is indeed wrong. We don't care about the return values of created debugfs files. We only have to check the return values of created dirs for NULL pointer. If we use a null pointer as parent directory for files, this may lead to debugfs files in wrong places. Signed-off-by: Markus Pargmann --- drivers/block/nbd.c | 55 ++++++++++++--------------------------------- 1 file changed, 14 insertions(+), 41 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index e4c5cc107934..d61a04155d99 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -892,50 +892,23 @@ static const struct file_operations nbd_dbg_flags_ops = { static int nbd_dev_dbg_init(struct nbd_device *nbd) { struct dentry *dir; - struct dentry *f; + + if (!nbd_dbg_dir) + return -EIO; dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir); - if (IS_ERR_OR_NULL(dir)) { - dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s' (%ld)\n", - nbd_name(nbd), PTR_ERR(dir)); - return PTR_ERR(dir); + if (!dir) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n", + nbd_name(nbd)); + return -EIO; } nbd->dbg_dir = dir; - f = debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); - if (IS_ERR_OR_NULL(f)) { - dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'tasks', %ld\n", - PTR_ERR(f)); - return PTR_ERR(f); - } - - f = debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); - if (IS_ERR_OR_NULL(f)) { - dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'size_bytes', %ld\n", - PTR_ERR(f)); - return PTR_ERR(f); - } - - f = debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); - if (IS_ERR_OR_NULL(f)) { - dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'timeout', %ld\n", - PTR_ERR(f)); - 
return PTR_ERR(f); - } - - f = debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); - if (IS_ERR_OR_NULL(f)) { - dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'blocksize', %ld\n", - PTR_ERR(f)); - return PTR_ERR(f); - } - - f = debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops); - if (IS_ERR_OR_NULL(f)) { - dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'flags', %ld\n", - PTR_ERR(f)); - return PTR_ERR(f); - } + debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); + debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); + debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); + debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); + debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops); return 0; } @@ -950,8 +923,8 @@ static int nbd_dbg_init(void) struct dentry *dbg_dir; dbg_dir = debugfs_create_dir("nbd", NULL); - if (IS_ERR(dbg_dir)) - return PTR_ERR(dbg_dir); + if (!dbg_dir) + return -EIO; nbd_dbg_dir = dbg_dir; From 23272a6754b81ff6503e09c743bb4ceeeab39997 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Thu, 29 Oct 2015 11:51:16 +0100 Subject: [PATCH 02/49] nbd: Remove signal usage As discussed on the mailing list, the usage of signals for timeout handling has a lot of potential issues. For some time, the nbd driver used signals for timeouts. These signals were able to get the threads out of the blocking socket operations. This patch removes all signal usage and uses a socket shutdown instead. The socket descriptor itself is cleared later when the whole nbd device is closed. The tasks_lock is removed as we do not depend on this anymore. Instead a new lock for the socket is introduced so we can safely work with the socket in the timeout handler outside of the two main threads.
Cc: Oleg Nesterov Cc: Christoph Hellwig Signed-off-by: Markus Pargmann Reviewed-by: Christoph Hellwig --- drivers/block/nbd.c | 126 +++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 78 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index d61a04155d99..438f4dc549db 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -60,7 +60,8 @@ struct nbd_device { bool disconnect; /* a disconnect has been requested by user */ struct timer_list timeout_timer; - spinlock_t tasks_lock; + /* protects initialization and shutdown of the socket */ + spinlock_t sock_lock; struct task_struct *task_recv; struct task_struct *task_send; @@ -129,13 +130,20 @@ static void nbd_end_request(struct nbd_device *nbd, struct request *req) */ static void sock_shutdown(struct nbd_device *nbd) { - if (!nbd->sock) + spin_lock_irq(&nbd->sock_lock); + + if (!nbd->sock) { + spin_unlock_irq(&nbd->sock_lock); return; + } dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + sockfd_put(nbd->sock); nbd->sock = NULL; - del_timer_sync(&nbd->timeout_timer); + spin_unlock_irq(&nbd->sock_lock); + + del_timer(&nbd->timeout_timer); } static void nbd_xmit_timeout(unsigned long arg) @@ -148,17 +156,15 @@ static void nbd_xmit_timeout(unsigned long arg) nbd->disconnect = true; - spin_lock_irqsave(&nbd->tasks_lock, flags); + spin_lock_irqsave(&nbd->sock_lock, flags); - if (nbd->task_recv) - force_sig(SIGKILL, nbd->task_recv); - if (nbd->task_send) - force_sig(SIGKILL, nbd->task_send); + if (nbd->sock) + kernel_sock_shutdown(nbd->sock, SHUT_RDWR); - spin_unlock_irqrestore(&nbd->tasks_lock, flags); + spin_unlock_irqrestore(&nbd->sock_lock, flags); - dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n"); + dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); } /* @@ -171,7 +177,6 @@ static int sock_xmit(struct nbd_device *nbd, int send, void 
*buf, int size, int result; struct msghdr msg; struct kvec iov; - sigset_t blocked, oldset; unsigned long pflags = current->flags; if (unlikely(!sock)) { @@ -181,11 +186,6 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, return -EINVAL; } - /* Allow interception of SIGKILL only - * Don't allow other signals to interrupt the transmission */ - siginitsetinv(&blocked, sigmask(SIGKILL)); - sigprocmask(SIG_SETMASK, &blocked, &oldset); - current->flags |= PF_MEMALLOC; do { sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; @@ -212,7 +212,6 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, buf += result; } while (size > 0); - sigprocmask(SIG_SETMASK, &oldset, NULL); tsk_restore_flags(current, pflags, PF_MEMALLOC); if (!send && nbd->xmit_timeout) @@ -406,23 +405,18 @@ static int nbd_thread_recv(struct nbd_device *nbd) { struct request *req; int ret; - unsigned long flags; BUG_ON(nbd->magic != NBD_MAGIC); sk_set_memalloc(nbd->sock->sk); - spin_lock_irqsave(&nbd->tasks_lock, flags); nbd->task_recv = current; - spin_unlock_irqrestore(&nbd->tasks_lock, flags); ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); - spin_lock_irqsave(&nbd->tasks_lock, flags); nbd->task_recv = NULL; - spin_unlock_irqrestore(&nbd->tasks_lock, flags); return ret; } @@ -439,19 +433,7 @@ static int nbd_thread_recv(struct nbd_device *nbd) device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - spin_lock_irqsave(&nbd->tasks_lock, flags); nbd->task_recv = NULL; - spin_unlock_irqrestore(&nbd->tasks_lock, flags); - - if (signal_pending(current)) { - ret = kernel_dequeue_signal(NULL); - dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", - task_pid_nr(current), current->comm, ret); - mutex_lock(&nbd->tx_lock); - sock_shutdown(nbd); - mutex_unlock(&nbd->tx_lock); - ret = -ETIMEDOUT; - } return ret; } @@ -544,11 +526,8 @@ static int nbd_thread_send(void *data) 
{ struct nbd_device *nbd = data; struct request *req; - unsigned long flags; - spin_lock_irqsave(&nbd->tasks_lock, flags); nbd->task_send = current; - spin_unlock_irqrestore(&nbd->tasks_lock, flags); set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { @@ -557,17 +536,6 @@ static int nbd_thread_send(void *data) kthread_should_stop() || !list_empty(&nbd->waiting_queue)); - if (signal_pending(current)) { - int ret = kernel_dequeue_signal(NULL); - - dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", - task_pid_nr(current), current->comm, ret); - mutex_lock(&nbd->tx_lock); - sock_shutdown(nbd); - mutex_unlock(&nbd->tx_lock); - break; - } - /* extract request */ if (list_empty(&nbd->waiting_queue)) continue; @@ -582,13 +550,7 @@ static int nbd_thread_send(void *data) nbd_handle_req(nbd, req); } - spin_lock_irqsave(&nbd->tasks_lock, flags); nbd->task_send = NULL; - spin_unlock_irqrestore(&nbd->tasks_lock, flags); - - /* Clear maybe pending signals */ - if (signal_pending(current)) - kernel_dequeue_signal(NULL); return 0; } @@ -636,6 +598,25 @@ static void nbd_request_handler(struct request_queue *q) } } +static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) +{ + int ret = 0; + + spin_lock_irq(&nbd->sock_lock); + + if (nbd->sock) { + ret = -EBUSY; + goto out; + } + + nbd->sock = sock; + +out: + spin_unlock_irq(&nbd->sock_lock); + + return ret; +} + static int nbd_dev_dbg_init(struct nbd_device *nbd); static void nbd_dev_dbg_close(struct nbd_device *nbd); @@ -668,32 +649,26 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return 0; } - case NBD_CLEAR_SOCK: { - struct socket *sock = nbd->sock; - nbd->sock = NULL; + case NBD_CLEAR_SOCK: + sock_shutdown(nbd); nbd_clear_que(nbd); BUG_ON(!list_empty(&nbd->queue_head)); BUG_ON(!list_empty(&nbd->waiting_queue)); kill_bdev(bdev); - if (sock) - sockfd_put(sock); return 0; - } case NBD_SET_SOCK: { - struct socket *sock; int err; - if 
(nbd->sock) - return -EBUSY; - sock = sockfd_lookup(arg, &err); - if (sock) { - nbd->sock = sock; - if (max_part > 0) - bdev->bd_invalidated = 1; - nbd->disconnect = false; /* we're connected now */ - return 0; - } - return -EINVAL; + struct socket *sock = sockfd_lookup(arg, &err); + + if (!sock) + return err; + + err = nbd_set_socket(nbd, sock); + if (!err && max_part) + bdev->bd_invalidated = 1; + + return err; } case NBD_SET_BLKSIZE: @@ -734,7 +709,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_DO_IT: { struct task_struct *thread; - struct socket *sock; int error; if (nbd->task_recv) @@ -769,14 +743,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, mutex_lock(&nbd->tx_lock); sock_shutdown(nbd); - sock = nbd->sock; - nbd->sock = NULL; nbd_clear_que(nbd); kill_bdev(bdev); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); set_device_ro(bdev, false); - if (sock) - sockfd_put(sock); nbd->flags = 0; nbd->bytesize = 0; bdev->bd_inode->i_size = 0; @@ -1042,7 +1012,7 @@ static int __init nbd_init(void) nbd_dev[i].magic = NBD_MAGIC; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); - spin_lock_init(&nbd_dev[i].tasks_lock); + spin_lock_init(&nbd_dev[i].sock_lock); INIT_LIST_HEAD(&nbd_dev[i].queue_head); mutex_init(&nbd_dev[i].tx_lock); init_timer(&nbd_dev[i].timeout_timer); From 1f7b5cf1be4351e60cf8ae7aab976503dd73c5f8 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Thu, 29 Oct 2015 12:01:34 +0100 Subject: [PATCH 03/49] nbd: Timeouts are not user requested disconnects It may be useful to know in the client that a connection timed out. The current code returns success for a timeout. This patch reports the error code -ETIMEDOUT for a timeout. 
Signed-off-by: Markus Pargmann --- drivers/block/nbd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 438f4dc549db..2e14e51b5ea3 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -57,6 +57,7 @@ struct nbd_device { int blksize; loff_t bytesize; int xmit_timeout; + bool timedout; bool disconnect; /* a disconnect has been requested by user */ struct timer_list timeout_timer; @@ -154,10 +155,9 @@ static void nbd_xmit_timeout(unsigned long arg) if (list_empty(&nbd->queue_head)) return; - nbd->disconnect = true; - spin_lock_irqsave(&nbd->sock_lock, flags); + nbd->timedout = true; if (nbd->sock) kernel_sock_shutdown(nbd->sock, SHUT_RDWR); @@ -754,7 +754,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, if (max_part > 0) blkdev_reread_part(bdev); if (nbd->disconnect) /* user requested, ignore socket errors */ - return 0; + error = 0; + if (nbd->timedout) + error = -ETIMEDOUT; + return error; } From 0e4f0f6f63d3416a9e529d99febfe98545427b81 Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Thu, 29 Oct 2015 12:04:51 +0100 Subject: [PATCH 04/49] nbd: Cleanup reset of nbd and bdev after a disconnect Group all variables that are reset after a disconnect into reset functions. This patch adds two of these functions, nbd_reset() and nbd_bdev_reset(). 
Signed-off-by: Markus Pargmann --- drivers/block/nbd.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2e14e51b5ea3..34a46c32c24f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -617,6 +617,30 @@ static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) return ret; } +/* Reset all properties of an NBD device */ +static void nbd_reset(struct nbd_device *nbd) +{ + nbd->disconnect = false; + nbd->timedout = false; + nbd->blksize = 1024; + nbd->bytesize = 0; + set_capacity(nbd->disk, 0); + nbd->flags = 0; + nbd->xmit_timeout = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); + del_timer_sync(&nbd->timeout_timer); +} + +static void nbd_bdev_reset(struct block_device *bdev) +{ + set_device_ro(bdev, false); + bdev->bd_inode->i_size = 0; + if (max_part > 0) { + blkdev_reread_part(bdev); + bdev->bd_invalidated = 1; + } +} + static int nbd_dev_dbg_init(struct nbd_device *nbd); static void nbd_dev_dbg_close(struct nbd_device *nbd); @@ -745,19 +769,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, sock_shutdown(nbd); nbd_clear_que(nbd); kill_bdev(bdev); - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); - set_device_ro(bdev, false); - nbd->flags = 0; - nbd->bytesize = 0; - bdev->bd_inode->i_size = 0; - set_capacity(nbd->disk, 0); - if (max_part > 0) - blkdev_reread_part(bdev); + nbd_bdev_reset(bdev); + if (nbd->disconnect) /* user requested, ignore socket errors */ error = 0; if (nbd->timedout) error = -ETIMEDOUT; + nbd_reset(nbd); + return error; } @@ -1023,14 +1043,12 @@ static int __init nbd_init(void) nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; init_waitqueue_head(&nbd_dev[i].active_wq); init_waitqueue_head(&nbd_dev[i].waiting_wq); - nbd_dev[i].blksize = 1024; - nbd_dev[i].bytesize = 0; disk->major = NBD_MAJOR; disk->first_minor = i << part_shift; disk->fops 
= &nbd_fops; disk->private_data = &nbd_dev[i]; sprintf(disk->disk_name, "nbd%d", i); - set_capacity(disk, 0); + nbd_reset(&nbd_dev[i]); add_disk(disk); } From d02cf53107792df373558851d6162dc4e4ceb95a Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Thu, 29 Oct 2015 12:06:15 +0100 Subject: [PATCH 05/49] nbd: Move flag parsing to a function nbd changes properties of the blockdevice depending on flags that were received. This patch moves this flag parsing into a separate function nbd_parse_flags(). Signed-off-by: Markus Pargmann --- drivers/block/nbd.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 34a46c32c24f..b67500d5b338 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -641,6 +641,18 @@ static void nbd_bdev_reset(struct block_device *bdev) } } +static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev) +{ + if (nbd->flags & NBD_FLAG_READ_ONLY) + set_device_ro(bdev, true); + if (nbd->flags & NBD_FLAG_SEND_TRIM) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); + if (nbd->flags & NBD_FLAG_SEND_FLUSH) + blk_queue_flush(nbd->disk->queue, REQ_FLUSH); + else + blk_queue_flush(nbd->disk->queue, 0); +} + static int nbd_dev_dbg_init(struct nbd_device *nbd); static void nbd_dev_dbg_close(struct nbd_device *nbd); @@ -742,15 +754,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, mutex_unlock(&nbd->tx_lock); - if (nbd->flags & NBD_FLAG_READ_ONLY) - set_device_ro(bdev, true); - if (nbd->flags & NBD_FLAG_SEND_TRIM) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, - nbd->disk->queue); - if (nbd->flags & NBD_FLAG_SEND_FLUSH) - blk_queue_flush(nbd->disk->queue, REQ_FLUSH); - else - blk_queue_flush(nbd->disk->queue, 0); + nbd_parse_flags(nbd, bdev); thread = kthread_run(nbd_thread_send, nbd, "%s", nbd_name(nbd)); From da6ccaaa79caca4f38b540b651238f87215217a2 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 14 
Jan 2016 13:42:32 -0500 Subject: [PATCH 06/49] nbd: ratelimit error msgs after socket close Make the "Attempted send on closed socket" error messages generated in nbd_request_handler() ratelimited. When the nbd socket is shutdown, the nbd_request_handler() function emits an error message for every request remaining in its queue. If the queue is large, this will spam a large amount of messages to the log. There's no need for a separate error message for each request, so this patch ratelimits it. In the specific case this was found, the system was virtual and the error messages were logged to the serial port, which overwhelmed it. Fixes: 4d48a542b427 ("nbd: fix I/O hang on disconnected nbds") Signed-off-by: Dan Streetman Signed-off-by: Markus Pargmann --- drivers/block/nbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b67500d5b338..4c5d94146aa3 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -580,8 +580,8 @@ static void nbd_request_handler(struct request_queue *q) req, req->cmd_type); if (unlikely(!nbd->sock)) { - dev_err(disk_to_dev(nbd->disk), - "Attempted send on closed socket\n"); + dev_err_ratelimited(disk_to_dev(nbd->disk), + "Attempted send on closed socket\n"); req->errors++; nbd_end_request(nbd, req); spin_lock_irq(q->queue_lock); From 949928c1c731417cc0f070912c63878b62b544f4 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 17 Dec 2015 17:08:15 -0700 Subject: [PATCH 07/49] NVMe: Fix possible queue use after freed This notifies blk-mq when the tag set contains a different number of queues prior to freeing unused ones that the request queue points to. 
Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 72ef8322d32a..08791338ce75 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1381,7 +1381,7 @@ static int nvme_kthread(void *data) static int nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i; + unsigned i, max; int ret = 0; for (i = dev->queue_count; i <= dev->max_qid; i++) { @@ -1391,7 +1391,8 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } } - for (i = dev->online_queues; i <= dev->queue_count - 1; i++) { + max = min(dev->max_qid, dev->queue_count - 1); + for (i = dev->online_queues; i <= max; i++) { ret = nvme_create_queue(dev->queues[i], i); if (ret) { nvme_free_queues(dev, i); @@ -1548,9 +1549,6 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) adminq->cq_vector = -1; goto free_queues; } - - /* Free previously allocated queues that are no longer usable */ - nvme_free_queues(dev, nr_io_queues + 1); return nvme_create_io_queues(dev); free_queues: @@ -1684,7 +1682,13 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->tagset)) return 0; dev->ctrl.tagset = &dev->tagset; + } else { + blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); + + /* Free previously allocated queues that are no longer usable */ + nvme_free_queues(dev, dev->online_queues); } + queue_work(nvme_workq, &dev->scan_work); return 0; } From f4f0f63e6f01055dfbdb7bc5e83935e1bdfa1980 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Feb 2016 12:44:03 -0700 Subject: [PATCH 08/49] nvme: fix drvdata setup for the nvme device Pass the right private data to device_create_with_groups from the beginning, and remove the superfluous call to dev_set_drvdata.
Signed-off-by: Christoph Hellwig Reviewed-by: Jon Derrick Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c5bf001af559..c326931d9b4d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1383,14 +1383,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, MKDEV(nvme_char_major, ctrl->instance), - dev, nvme_dev_attr_groups, + ctrl, nvme_dev_attr_groups, "nvme%d", ctrl->instance); if (IS_ERR(ctrl->device)) { ret = PTR_ERR(ctrl->device); goto out_release_instance; } get_device(ctrl->device); - dev_set_drvdata(ctrl->device, ctrl); spin_lock(&dev_list_lock); list_add_tail(&ctrl->node, &nvme_ctrl_list); From 1b3c47c182aac70c4487105d2e22a17f0193525f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 10 Feb 2016 08:51:15 -0700 Subject: [PATCH 09/49] nvme: Log the ctrl device name instead of the underlying pci device name Having the ctrl name "nvmeX" seems much more friendly than the underlying device name. Also, with other nvme transports such as the soon to come nvme-loop we don't have an underlying device so it doesn't make sense to make up one. In order to help matching an instance name to a pci function, we add an info print in nvme_probe. Signed-off-by: Sagi Grimberg Acked-by: Keith Busch Manually fixed up the hunk in nvme_cancel_queue_ios().
Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 24 ++++++++++++------------ drivers/nvme/host/pci.c | 37 +++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 28 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c326931d9b4d..a7c29f24976c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -557,8 +557,8 @@ static int nvme_revalidate_disk(struct gendisk *disk) unsigned short bs; if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { - dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", - __func__, ns->ctrl->instance, ns->ns_id); + dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", + __func__); return -ENODEV; } if (id->ncap == 0) { @@ -568,7 +568,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(ns->ctrl->dev, + dev_warn(disk_to_dev(ns->disk), "%s: LightNVM init failure\n", __func__); kfree(id); return -ENODEV; @@ -741,7 +741,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "Device not ready; aborting %s\n", enabled ? 
"initialisation" : "reset"); return -ENODEV; @@ -781,7 +781,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) int ret; if (page_shift < dev_page_min) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", 1 << dev_page_min, 1 << page_shift); return -ENODEV; @@ -822,7 +822,7 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "Device shutdown incomplete; abort shutdown\n"); return -ENODEV; } @@ -844,13 +844,13 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { - dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret); + dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); return ret; } ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); if (ret) { - dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret); + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); return ret; } page_shift = NVME_CAP_MPSMIN(cap) + 12; @@ -860,7 +860,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = nvme_identify_ctrl(ctrl, &id); if (ret) { - dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret); + dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret); return -EIO; } @@ -937,13 +937,13 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { - dev_warn(ctrl->dev, + dev_warn(ctrl->device, "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); ret = -EINVAL; goto out_unlock; } - dev_warn(ctrl->dev, + dev_warn(ctrl->device, "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); kref_get(&ns->kref); mutex_unlock(&ctrl->namespaces_mutex); @@ -969,7 +969,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, case 
NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: - dev_warn(ctrl->dev, "resetting controller\n"); + dev_warn(ctrl->device, "resetting controller\n"); return ctrl->ops->reset_ctrl(ctrl); case NVME_IOCTL_SUBSYS_RESET: return nvme_reset_subsystem(ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 08791338ce75..f2f55b504cf2 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -299,10 +299,10 @@ static void nvme_complete_async_event(struct nvme_dev *dev, switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: - dev_info(dev->dev, "rescanning\n"); + dev_info(dev->ctrl.device, "rescanning\n"); queue_work(nvme_workq, &dev->scan_work); default: - dev_warn(dev->dev, "async event result %08x\n", result); + dev_warn(dev->ctrl.device, "async event result %08x\n", result); } } @@ -708,7 +708,7 @@ static void nvme_complete_rq(struct request *req) } if (unlikely(iod->aborted)) { - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "completing aborted command with status: %04x\n", req->errors); } @@ -740,7 +740,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) *tag = -1; if (unlikely(cqe.command_id >= nvmeq->q_depth)) { - dev_warn(nvmeq->q_dmadev, + dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", cqe.command_id, le16_to_cpu(cqe.sq_id)); continue; @@ -908,7 +908,8 @@ static void abort_endio(struct request *req, int error) u32 result = (u32)(uintptr_t)req->special; u16 status = req->errors; - dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + dev_warn(nvmeq->dev->ctrl.device, + "Abort status:%x result:%x", status, result); atomic_inc(&nvmeq->dev->ctrl.abort_limit); blk_mq_free_request(req); @@ -929,7 +930,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * shutdown, so we return BLK_EH_HANDLED. 
*/ if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); @@ -943,7 +944,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * returned to the driver, or if this is the admin queue. */ if (!nvmeq->qid || iod->aborted) { - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); @@ -969,8 +970,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", - req->tag, nvmeq->qid); + dev_warn(nvmeq->dev->ctrl.device, + "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, BLK_MQ_REQ_NOWAIT); @@ -999,7 +1001,7 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved if (!blk_mq_request_started(req)) return; - dev_warn(nvmeq->q_dmadev, + dev_warn(nvmeq->dev->ctrl.device, "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); status = NVME_SC_ABORT_REQ; @@ -1355,7 +1357,7 @@ static int nvme_kthread(void *data) if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || csts & NVME_CSTS_CFS) { if (queue_work(nvme_workq, &dev->reset_work)) { - dev_warn(dev->dev, + dev_warn(dev->ctrl.device, "Failed status: %x, reset controller\n", readl(dev->bar + NVME_REG_CSTS)); } @@ -1483,7 +1485,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) * access to the admin queue, as that might be only way to fix them up. 
*/ if (result > 0) { - dev_err(dev->dev, "Could not set queue count (%d)\n", result); + dev_err(dev->ctrl.device, + "Could not set queue count (%d)\n", result); nr_io_queues = 0; result = 0; } @@ -1947,7 +1950,7 @@ static void nvme_reset_work(struct work_struct *work) * any working I/O queue. */ if (dev->online_queues < 2) { - dev_warn(dev->dev, "IO queues not created\n"); + dev_warn(dev->ctrl.device, "IO queues not created\n"); nvme_remove_namespaces(&dev->ctrl); } else { nvme_start_queues(&dev->ctrl); @@ -1984,7 +1987,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) static void nvme_remove_dead_ctrl(struct nvme_dev *dev) { - dev_warn(dev->dev, "Removing after probe failure\n"); + dev_warn(dev->ctrl.device, "Removing after probe failure\n"); kref_get(&dev->ctrl.kref); if (!schedule_work(&dev->remove_work)) nvme_put_ctrl(&dev->ctrl); @@ -2081,6 +2084,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; + dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); + queue_work(nvme_workq, &dev->reset_work); return 0; @@ -2164,7 +2169,7 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, * shutdown the controller to quiesce. The controller will be restarted * after the slot reset through driver's slot_reset callback. 
*/ - dev_warn(&pdev->dev, "error detected: state:%d\n", state); + dev_warn(dev->ctrl.device, "error detected: state:%d\n", state); switch (state) { case pci_channel_io_normal: return PCI_ERS_RESULT_CAN_RECOVER; @@ -2181,7 +2186,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - dev_info(&pdev->dev, "restart after slot reset\n"); + dev_info(dev->ctrl.device, "restart after slot reset\n"); pci_restore_state(pdev); queue_work(nvme_workq, &dev->reset_work); return PCI_ERS_RESULT_RECOVERED; From e439bb12e75c2807029853493fa787c6d70c763a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 10 Feb 2016 10:03:29 -0800 Subject: [PATCH 10/49] nvme/host: reference the fabric module for each bdev open callout We don't want to be able to unload the fabric driver when we have open references to our namespaces. Thus, for each nvme_open we take a reference on the fabric driver and put it in nvme_release. This behavior is consistent with the scsi model. This resolves the panic when unloading a fabric module with mpath holders.
Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Ian Bakshan Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lin Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 19 ++++++++++++++++--- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 1 + 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a7c29f24976c..c9cd07f02cc2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -71,11 +71,21 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) spin_lock(&dev_list_lock); ns = disk->private_data; - if (ns && !kref_get_unless_zero(&ns->kref)) - ns = NULL; + if (ns) { + if (!kref_get_unless_zero(&ns->kref)) + goto fail; + if (!try_module_get(ns->ctrl->ops->module)) + goto fail_put_ns; + } spin_unlock(&dev_list_lock); return ns; + +fail_put_ns: + kref_put(&ns->kref, nvme_free_ns); +fail: + spin_unlock(&dev_list_lock); + return NULL; } void nvme_requeue_req(struct request *req) @@ -499,7 +509,10 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) static void nvme_release(struct gendisk *disk, fmode_t mode) { - nvme_put_ns(disk->private_data); + struct nvme_ns *ns = disk->private_data; + + module_put(ns->ctrl->ops->module); + nvme_put_ns(ns); } static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4fb5bb737868..9f77386f7d1e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -117,6 +117,7 @@ struct nvme_ns { }; struct nvme_ctrl_ops { + struct module *module; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f2f55b504cf2..cb303ac91b9d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2036,6 
+2036,7 @@ static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) } static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .module = THIS_MODULE, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, From ba0ba7d3e5266111ec865b0bf1ad48dd0e2a2314 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Wed, 10 Feb 2016 10:03:30 -0800 Subject: [PATCH 11/49] nvme: move timeout variables to core.c These variables are used by PCI driver and will also be used in the forthcoming NVMe over Fabrics drivers. Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lin Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 12 ++++++++++++ drivers/nvme/host/pci.c | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c9cd07f02cc2..0c0011b5e1b9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -33,6 +33,18 @@ #define NVME_MINORS (1U << MINORBITS) +unsigned char admin_timeout = 60; +module_param(admin_timeout, byte, 0644); +MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); + +unsigned char nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, byte, 0644); +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); + +unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + static int nvme_major; module_param(nvme_major, int, 0); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cb303ac91b9d..53a99422d44d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -57,18 +57,6 @@ #define NVME_NR_AEN_COMMANDS 1 #define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) -unsigned char admin_timeout = 60; -module_param(admin_timeout, byte, 0644); -MODULE_PARM_DESC(admin_timeout, "timeout in 
seconds for admin commands"); - -unsigned char nvme_io_timeout = 30; -module_param_named(io_timeout, nvme_io_timeout, byte, 0644); -MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); - -unsigned char shutdown_timeout = 5; -module_param(shutdown_timeout, byte, 0644); -MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); - static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); From 9f2482b91bcd02ac2999cf04b3fb1b89e1c4d559 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Wed, 10 Feb 2016 10:03:31 -0800 Subject: [PATCH 12/49] nvme: split dev_list_lock Split dev_list_lock into one in the core and one in the PCI driver. Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lin Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/nvme.h | 2 -- drivers/nvme/host/pci.c | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0c0011b5e1b9..6eb42d24a5e9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -52,7 +52,7 @@ static int nvme_char_major; module_param(nvme_char_major, int, 0); static LIST_HEAD(nvme_ctrl_list); -DEFINE_SPINLOCK(dev_list_lock); +static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9f77386f7d1e..63ba8a500ee1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -266,8 +266,6 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); -extern spinlock_t dev_list_lock; - struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 53a99422d44d..54e79c035913 100644 --- a/drivers/nvme/host/pci.c +++ 
b/drivers/nvme/host/pci.c @@ -65,6 +65,7 @@ module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); static LIST_HEAD(dev_list); +static DEFINE_SPINLOCK(dev_list_lock); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; From 576d55d625664a20ee4bae6500952febfb2d7b10 Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Wed, 10 Feb 2016 10:03:32 -0800 Subject: [PATCH 13/49] nvme: split pci module out of core module NVMe over Fabrics drivers are going to reuse the core, so splits nvme.ko into 2 modules: nvme-core.ko: the core part nvme.ko: the PCI driver Export symbols from nvme-core.ko. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lin Signed-off-by: Jens Axboe --- drivers/nvme/host/Kconfig | 6 +++++- drivers/nvme/host/Makefile | 10 ++++++---- drivers/nvme/host/core.c | 24 +++++++++++++++++++++++- drivers/nvme/host/pci.c | 13 +------------ 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 5d6237391dcd..2ed30f063a13 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -1,6 +1,10 @@ +config NVME_CORE + tristate + config BLK_DEV_NVME tristate "NVM Express block device" depends on PCI && BLOCK + select NVME_CORE ---help--- The NVM Express driver is for solid state drives directly connected to the PCI or PCI Express bus. 
If you know you @@ -11,7 +15,7 @@ config BLK_DEV_NVME config BLK_DEV_NVME_SCSI bool "SCSI emulation for NVMe device nodes" - depends on BLK_DEV_NVME + depends on NVME_CORE ---help--- This adds support for the SG_IO ioctl on the NVMe character and block devices nodes, as well a a translation for a small diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index 51bf90871549..9a3ca892b4a7 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,6 +1,8 @@ +obj-$(CONFIG_NVME_CORE) += nvme-core.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +nvme-core-y := core.o +nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o +nvme-core-$(CONFIG_NVM) += lightnvm.o -lightnvm-$(CONFIG_NVM) := lightnvm.o -nvme-y += core.o pci.o $(lightnvm-y) -nvme-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o +nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6eb42d24a5e9..07b7ec699e92 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -36,10 +36,12 @@ unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); +EXPORT_SYMBOL_GPL(admin_timeout); unsigned char nvme_io_timeout = 30; module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); +EXPORT_SYMBOL_GPL(nvme_io_timeout); unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); @@ -110,6 +112,7 @@ void nvme_requeue_req(struct request *req) blk_mq_kick_requeue_list(req->q); spin_unlock_irqrestore(req->q->queue_lock, flags); } +EXPORT_SYMBOL_GPL(nvme_requeue_req); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags) @@ -133,6 +136,7 @@ struct request *nvme_alloc_request(struct request_queue *q, return req; } +EXPORT_SYMBOL_GPL(nvme_alloc_request); /* * Returns 0 on success. 
If the result is negative, it's a Linux error code; @@ -170,6 +174,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, { return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0); } +EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, @@ -385,6 +390,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) *count = min(*count, nr_io_queues); return 0; } +EXPORT_SYMBOL_GPL(nvme_set_queue_count); static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { @@ -794,6 +800,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) return ret; return nvme_wait_ready(ctrl, cap, false); } +EXPORT_SYMBOL_GPL(nvme_disable_ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) { @@ -825,6 +832,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) return ret; return nvme_wait_ready(ctrl, cap, true); } +EXPORT_SYMBOL_GPL(nvme_enable_ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) { @@ -855,6 +863,7 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) return ret; } +EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); /* * Initialize the cached copies of the Identify data and various controller @@ -916,6 +925,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) kfree(id); return 0; } +EXPORT_SYMBOL_GPL(nvme_init_identify); static int nvme_dev_open(struct inode *inode, struct file *file) { @@ -1321,6 +1331,7 @@ void nvme_scan_namespaces(struct nvme_ctrl *ctrl) mutex_unlock(&ctrl->namespaces_mutex); kfree(id); } +EXPORT_SYMBOL_GPL(nvme_scan_namespaces); void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { @@ -1331,6 +1342,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) nvme_ns_remove(ns); mutex_unlock(&ctrl->namespaces_mutex); } +EXPORT_SYMBOL_GPL(nvme_remove_namespaces); static DEFINE_IDA(nvme_instance_ida); @@ -1362,13 +1374,14 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) } void 
nvme_uninit_ctrl(struct nvme_ctrl *ctrl) - { +{ device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); spin_lock(&dev_list_lock); list_del(&ctrl->node); spin_unlock(&dev_list_lock); } +EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); static void nvme_free_ctrl(struct kref *kref) { @@ -1384,6 +1397,7 @@ void nvme_put_ctrl(struct nvme_ctrl *ctrl) { kref_put(&ctrl->kref, nvme_free_ctrl); } +EXPORT_SYMBOL_GPL(nvme_put_ctrl); /* * Initialize a NVMe controller structures. This needs to be called during @@ -1426,6 +1440,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, out: return ret; } +EXPORT_SYMBOL_GPL(nvme_init_ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl) { @@ -1442,6 +1457,7 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl) } mutex_unlock(&ctrl->namespaces_mutex); } +EXPORT_SYMBOL_GPL(nvme_stop_queues); void nvme_start_queues(struct nvme_ctrl *ctrl) { @@ -1455,6 +1471,7 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } mutex_unlock(&ctrl->namespaces_mutex); } +EXPORT_SYMBOL_GPL(nvme_start_queues); int __init nvme_core_init(void) { @@ -1494,3 +1511,8 @@ void nvme_core_exit(void) class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); } + +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_core_init); +module_exit(nvme_core_exit); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 54e79c035913..fec747917690 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2230,26 +2230,15 @@ static int __init nvme_init(void) if (!nvme_workq) return -ENOMEM; - result = nvme_core_init(); - if (result < 0) - goto kill_workq; - result = pci_register_driver(&nvme_driver); if (result) - goto core_exit; - return 0; - - core_exit: - nvme_core_exit(); - kill_workq: - destroy_workqueue(nvme_workq); + destroy_workqueue(nvme_workq); return result; } static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - nvme_core_exit(); destroy_workqueue(nvme_workq); 
BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); From 37091fdd831f28a6509008542174ed324dd645bc Mon Sep 17 00:00:00 2001 From: Markus Pargmann Date: Mon, 27 Jul 2015 07:36:49 +0200 Subject: [PATCH 14/49] nbd: Create size change events for userspace The userspace needs to know when nbd devices are ready for use. Currently no events are created for the userspace which doesn't work for systemd. See the discussion here: https://github.com/systemd/systemd/pull/358 This patch uses a central point to setup the nbd-internal sizes. A ioctl to set a size does not lead to a visible size change. The size of the block device will be kept at 0 until nbd is connected. As soon as it connects, the size will be changed to the real value and a uevent is created. When disconnecting, the blockdevice is set to 0 size and another uevent is generated. Signed-off-by: Markus Pargmann --- drivers/block/nbd.c | 79 +++++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4c5d94146aa3..f6b51d76e578 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -100,6 +100,11 @@ static inline struct device *nbd_to_dev(struct nbd_device *nbd) return disk_to_dev(nbd->disk); } +static bool nbd_is_connected(struct nbd_device *nbd) +{ + return !!nbd->task_recv; +} + static const char *nbdcmd_to_ascii(int cmd) { switch (cmd) { @@ -112,6 +117,42 @@ static const char *nbdcmd_to_ascii(int cmd) return "invalid"; } +static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev) +{ + bdev->bd_inode->i_size = 0; + set_capacity(nbd->disk, 0); + kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); + + return 0; +} + +static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev) +{ + if (!nbd_is_connected(nbd)) + return; + + bdev->bd_inode->i_size = nbd->bytesize; + set_capacity(nbd->disk, nbd->bytesize >> 9); + kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE); +} + 
+static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev, + int blocksize, int nr_blocks) +{ + int ret; + + ret = set_blocksize(bdev, blocksize); + if (ret) + return ret; + + nbd->blksize = blocksize; + nbd->bytesize = (loff_t)blocksize * (loff_t)nr_blocks; + + nbd_size_update(nbd, bdev); + + return 0; +} + static void nbd_end_request(struct nbd_device *nbd, struct request *req) { int error = req->errors ? -EIO : 0; @@ -401,7 +442,7 @@ static struct device_attribute pid_attr = { .show = pid_show, }; -static int nbd_thread_recv(struct nbd_device *nbd) +static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) { struct request *req; int ret; @@ -421,6 +462,8 @@ static int nbd_thread_recv(struct nbd_device *nbd) return ret; } + nbd_size_update(nbd, bdev); + while (1) { req = nbd_read_stat(nbd); if (IS_ERR(req)) { @@ -431,6 +474,8 @@ static int nbd_thread_recv(struct nbd_device *nbd) nbd_end_request(nbd, req); } + nbd_size_clear(nbd, bdev); + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); nbd->task_recv = NULL; @@ -707,20 +752,19 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return err; } - case NBD_SET_BLKSIZE: - nbd->blksize = arg; - nbd->bytesize &= ~(nbd->blksize-1); - bdev->bd_inode->i_size = nbd->bytesize; - set_blocksize(bdev, nbd->blksize); - set_capacity(nbd->disk, nbd->bytesize >> 9); - return 0; + case NBD_SET_BLKSIZE: { + loff_t bsize = nbd->bytesize; + do_div(bsize, arg); + + return nbd_size_set(nbd, bdev, arg, bsize); + } case NBD_SET_SIZE: - nbd->bytesize = arg & ~(nbd->blksize-1); - bdev->bd_inode->i_size = nbd->bytesize; - set_blocksize(bdev, nbd->blksize); - set_capacity(nbd->disk, nbd->bytesize >> 9); - return 0; + return nbd_size_set(nbd, bdev, nbd->blksize, + arg / nbd->blksize); + + case NBD_SET_SIZE_BLOCKS: + return nbd_size_set(nbd, bdev, nbd->blksize, arg); case NBD_SET_TIMEOUT: nbd->xmit_timeout = arg * HZ; @@ -736,13 +780,6 @@ static int __nbd_ioctl(struct 
block_device *bdev, struct nbd_device *nbd, nbd->flags = arg; return 0; - case NBD_SET_SIZE_BLOCKS: - nbd->bytesize = ((u64) arg) * nbd->blksize; - bdev->bd_inode->i_size = nbd->bytesize; - set_blocksize(bdev, nbd->blksize); - set_capacity(nbd->disk, nbd->bytesize >> 9); - return 0; - case NBD_DO_IT: { struct task_struct *thread; int error; @@ -764,7 +801,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, } nbd_dev_dbg_init(nbd); - error = nbd_thread_recv(nbd); + error = nbd_thread_recv(nbd, bdev); nbd_dev_dbg_close(nbd); kthread_stop(thread); From 9396dec916c052855dbb5b876c13d163df397319 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Feb 2016 15:59:44 +0100 Subject: [PATCH 15/49] nvme: use a work item to submit async event requests Use a dedicated work item to submit async event requests instead of the global kthread. This simplifies the code and reduces the latencies to resubmit a request once an event notification happened. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fec747917690..21b0be480fa5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -100,6 +100,7 @@ struct nvme_dev { struct work_struct reset_work; struct work_struct scan_work; struct work_struct remove_work; + struct work_struct async_work; struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; @@ -281,8 +282,11 @@ static void nvme_complete_async_event(struct nvme_dev *dev, u16 status = le16_to_cpu(cqe->status) >> 1; u32 result = le32_to_cpu(cqe->result); - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { ++dev->ctrl.event_limit; + queue_work(nvme_workq, &dev->async_work); + } + if
(status != NVME_SC_SUCCESS) return; @@ -816,15 +820,22 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -static void nvme_submit_async_event(struct nvme_dev *dev) +static void nvme_async_event_work(struct work_struct *work) { + struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work); + struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit; - __nvme_submit_cmd(dev->queues[0], &c); + spin_lock_irq(&nvmeq->q_lock); + while (dev->ctrl.event_limit > 0) { + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + + --dev->ctrl.event_limit; + __nvme_submit_cmd(nvmeq, &c); + } + spin_unlock_irq(&nvmeq->q_lock); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1358,9 +1369,6 @@ static int nvme_kthread(void *data) continue; spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - - while (i == 0 && dev->ctrl.event_limit > 0) - nvme_submit_async_event(dev); spin_unlock_irq(&nvmeq->q_lock); } } @@ -1929,6 +1937,7 @@ static void nvme_reset_work(struct work_struct *work) goto free_tags; dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; + queue_work(nvme_workq, &dev->async_work); result = nvme_dev_list_add(dev); if (result) @@ -2062,6 +2071,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + INIT_WORK(&dev->async_work, nvme_async_event_work); mutex_init(&dev->shutdown_lock); init_completion(&dev->ioq_wait); @@ -2115,6 +2125,7 @@ static void nvme_remove(struct pci_dev *pdev) spin_unlock(&dev_list_lock); pci_set_drvdata(pdev, NULL); + flush_work(&dev->async_work); flush_work(&dev->reset_work); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); From 
79f2b358c9ba373943a9284be2861fde58291c4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Feb 2016 15:59:45 +0100 Subject: [PATCH 16/49] nvme: don't poll the CQ from the kthread There is no reason to do unconditional polling of CQs per the NVMe spec. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 21b0be480fa5..10839f76179c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1156,9 +1156,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->qid = qid; nvmeq->cq_vector = -1; dev->queues[qid] = nvmeq; - - /* make sure queue descriptor is set before queue count, for kthread */ - mb(); dev->queue_count++; return nvmeq; @@ -1345,7 +1342,6 @@ static int nvme_kthread(void *data) set_current_state(TASK_INTERRUPTIBLE); spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { - int i; u32 csts = readl(dev->bar + NVME_REG_CSTS); /* @@ -1363,14 +1359,6 @@ static int nvme_kthread(void *data) } continue; } - for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = dev->queues[i]; - if (!nvmeq) - continue; - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - } } spin_unlock(&dev_list_lock); schedule_timeout(round_jiffies_relative(HZ)); From 2d55cd5f511d6fc377734473b237ac50820bfb9f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Feb 2016 15:59:46 +0100 Subject: [PATCH 17/49] nvme: replace the kthread with a per-device watchdog timer The only work left in the kthread is the periodic health check for each controller. There is no need to run this from process context or keep a thread context around for it, so replace it with a simpler timer. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 112 +++++++++------------------------------- 1 file changed, 23 insertions(+), 89 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 10839f76179c..a62336051178 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -39,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -64,11 +64,7 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static LIST_HEAD(dev_list); -static DEFINE_SPINLOCK(dev_list_lock); -static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; -static wait_queue_head_t nvme_kthread_wait; struct nvme_dev; struct nvme_queue; @@ -82,7 +78,6 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); * Represents an NVM Express device. Each nvme_dev is a PCI function. 
*/ struct nvme_dev { - struct list_head node; struct nvme_queue **queues; struct blk_mq_tag_set tagset; struct blk_mq_tag_set admin_tagset; @@ -101,6 +96,7 @@ struct nvme_dev { struct work_struct scan_work; struct work_struct remove_work; struct work_struct async_work; + struct timer_list watchdog_timer; struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; @@ -1334,36 +1330,26 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_kthread(void *data) +static void nvme_watchdog_timer(unsigned long data) { - struct nvme_dev *dev, *next; + struct nvme_dev *dev = (struct nvme_dev *)data; + u32 csts = readl(dev->bar + NVME_REG_CSTS); - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - spin_lock(&dev_list_lock); - list_for_each_entry_safe(dev, next, &dev_list, node) { - u32 csts = readl(dev->bar + NVME_REG_CSTS); - - /* - * Skip controllers currently under reset. - */ - if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work)) - continue; - - if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || - csts & NVME_CSTS_CFS) { - if (queue_work(nvme_workq, &dev->reset_work)) { - dev_warn(dev->ctrl.device, - "Failed status: %x, reset controller\n", - readl(dev->bar + NVME_REG_CSTS)); - } - continue; - } + /* + * Skip controllers currently under reset. 
+ */ + if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) && + ((csts & NVME_CSTS_CFS) || + (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) { + if (queue_work(nvme_workq, &dev->reset_work)) { + dev_warn(dev->dev, + "Failed status: 0x%x, reset controller.\n", + csts); } - spin_unlock(&dev_list_lock); - schedule_timeout(round_jiffies_relative(HZ)); + return; } - return 0; + + mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); } static int nvme_create_io_queues(struct nvme_dev *dev) @@ -1777,56 +1763,12 @@ static void nvme_dev_unmap(struct nvme_dev *dev) } } -static int nvme_dev_list_add(struct nvme_dev *dev) -{ - bool start_thread = false; - - spin_lock(&dev_list_lock); - if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { - start_thread = true; - nvme_thread = NULL; - } - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - if (start_thread) { - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up_all(&nvme_kthread_wait); - } else - wait_event_killable(nvme_kthread_wait, nvme_thread); - - if (IS_ERR_OR_NULL(nvme_thread)) - return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; - - return 0; -} - -/* -* Remove the node from the device list and check -* for whether or not we need to stop the nvme_thread. 
-*/ -static void nvme_dev_list_remove(struct nvme_dev *dev) -{ - struct task_struct *tmp = NULL; - - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { - tmp = nvme_thread; - nvme_thread = NULL; - } - spin_unlock(&dev_list_lock); - - if (tmp) - kthread_stop(tmp); -} - static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i; u32 csts = -1; - nvme_dev_list_remove(dev); + del_timer_sync(&dev->watchdog_timer); mutex_lock(&dev->shutdown_lock); if (dev->bar) { @@ -1927,9 +1869,7 @@ static void nvme_reset_work(struct work_struct *work) dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; queue_work(nvme_workq, &dev->async_work); - result = nvme_dev_list_add(dev); - if (result) - goto remove; + mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); /* * Keep the controller around but remove all namespaces if we don't have @@ -1946,8 +1886,6 @@ static void nvme_reset_work(struct work_struct *work) clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; - remove: - nvme_dev_list_remove(dev); free_tags: nvme_dev_remove_admin(dev); blk_put_queue(dev->ctrl.admin_q); @@ -2055,11 +1993,12 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); - INIT_LIST_HEAD(&dev->node); INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); INIT_WORK(&dev->async_work, nvme_async_event_work); + setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, + (unsigned long)dev); mutex_init(&dev->shutdown_lock); init_completion(&dev->ioq_wait); @@ -2108,9 +2047,7 @@ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - spin_unlock(&dev_list_lock); + del_timer_sync(&dev->watchdog_timer); pci_set_drvdata(pdev, NULL); flush_work(&dev->async_work); @@ 
-2223,8 +2160,6 @@ static int __init nvme_init(void) { int result; - init_waitqueue_head(&nvme_kthread_wait); - nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!nvme_workq) return -ENOMEM; @@ -2239,7 +2174,6 @@ static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); destroy_workqueue(nvme_workq); - BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } From 1cb3cce5eb9de335330c8a147e47e3359a51a8b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Feb 2016 15:59:47 +0100 Subject: [PATCH 18/49] nvme: return the whole CQE through the request passthrough interface Both LightNVM and NVMe over Fabrics need to look at more than just the status and result fields. Signed-off-by: Christoph Hellwig Reviewed-by: Matias Bjørling Reviewed-by: Jay Freyensee Reviewed-by: Sagi Grimberg Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 27 +++++++++++++++++++-------- drivers/nvme/host/nvme.h | 3 ++- drivers/nvme/host/pci.c | 11 +++-------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 07b7ec699e92..66fd3d9e4d47 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -132,7 +132,6 @@ struct request *nvme_alloc_request(struct request_queue *q, req->cmd = (unsigned char *)cmd; req->cmd_len = sizeof(struct nvme_command); - req->special = (void *)0; return req; } @@ -143,7 +142,8 @@ EXPORT_SYMBOL_GPL(nvme_alloc_request); * if the result is positive, it's an NVM Express status code */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen, u32 *result, unsigned timeout) + struct nvme_completion *cqe, void *buffer, unsigned bufflen, + unsigned timeout) { struct request *req; int ret; @@ -153,6 +153,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, return PTR_ERR(req); req->timeout =
timeout ? timeout : ADMIN_TIMEOUT; + req->special = cqe; if (buffer && bufflen) { ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); @@ -161,8 +162,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } blk_execute_rq(req->q, NULL, req, 0); - if (result) - *result = (u32)(uintptr_t)req->special; ret = req->errors; out: blk_mq_free_request(req); @@ -172,7 +171,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { - return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0); + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0); } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); @@ -182,6 +181,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, u32 *result, unsigned timeout) { bool write = cmd->common.opcode & 1; + struct nvme_completion cqe; struct nvme_ns *ns = q->queuedata; struct gendisk *disk = ns ? ns->disk : NULL; struct request *req; @@ -194,6 +194,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, return PTR_ERR(req); req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; + req->special = &cqe; if (ubuffer && bufflen) { ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, @@ -248,7 +249,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, blk_execute_rq(req->q, disk, req, 0); ret = req->errors; if (result) - *result = (u32)(uintptr_t)req->special; + *result = le32_to_cpu(cqe.result); if (meta && !ret && !write) { if (copy_to_user(meta_buffer, meta, meta_len)) ret = -EFAULT; @@ -329,6 +330,8 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; + struct nvme_completion cqe; + int ret; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; @@ -336,13 +339,18 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); + if (ret >= 0) + *result = le32_to_cpu(cqe.result); + return ret; } int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; + struct nvme_completion cqe; + int ret; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_set_features; @@ -350,7 +358,10 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0); + if (ret >= 0) + *result = le32_to_cpu(cqe.result); + return ret; } int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 63ba8a500ee1..2ac7539fdd17 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -248,7 
+248,8 @@ void nvme_requeue_req(struct request *req); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen, u32 *result, unsigned timeout); + struct nvme_completion *cqe, void *buffer, unsigned bufflen, + unsigned timeout); int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, u32 *result, unsigned timeout); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a62336051178..d47b08783110 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -748,10 +748,8 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) } req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - u32 result = le32_to_cpu(cqe.result); - req->special = (void *)(uintptr_t)result; - } + if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) + memcpy(req->special, &cqe, sizeof(cqe)); blk_mq_complete_request(req, status >> 1); } @@ -901,13 +899,10 @@ static void abort_endio(struct request *req, int error) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = iod->nvmeq; - u32 result = (u32)(uintptr_t)req->special; u16 status = req->errors; - dev_warn(nvmeq->dev->ctrl.device, - "Abort status:%x result:%x", status, result); + dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status); atomic_inc(&nvmeq->dev->ctrl.abort_limit); - blk_mq_free_request(req); } From 931e1c2204c6d00c11c5c1e2e1c20b5ca41f292d Mon Sep 17 00:00:00 2001 From: Ming Lin Date: Fri, 26 Feb 2016 13:24:19 -0800 Subject: [PATCH 19/49] nvme: expose cntlid in sysfs For NVMe over Fabrics, the cntlid will be used by systemd/udev to create link to the device, for example, /dev/disk/by-path/-- -> /dev/nvme0n1 Signed-off-by: Ming Lin Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig 
Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 20 ++++++++++++++++---- drivers/nvme/host/nvme.h | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 66fd3d9e4d47..f08dccee8143 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -912,6 +912,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; + ctrl->cntlid = le16_to_cpup(&id->cntlid); memcpy(ctrl->serial, id->sn, sizeof(id->sn)); memcpy(ctrl->model, id->mn, sizeof(id->mn)); memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); @@ -1099,7 +1100,7 @@ static const struct attribute_group nvme_ns_attr_group = { .is_visible = nvme_attrs_are_visible, }; -#define nvme_show_function(field) \ +#define nvme_show_str_function(field) \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ @@ -1108,15 +1109,26 @@ static ssize_t field##_show(struct device *dev, \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); -nvme_show_function(model); -nvme_show_function(serial); -nvme_show_function(firmware_rev); +#define nvme_show_int_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%d\n", ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); +nvme_show_int_function(cntlid); static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_model.attr, &dev_attr_serial.attr, &dev_attr_firmware_rev.attr, + &dev_attr_cntlid.attr, NULL }; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2ac7539fdd17..9b71fa8c75e4 100644 --- 
a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -77,6 +77,7 @@ struct nvme_ctrl { char serial[20]; char model[40]; char firmware_rev[8]; + int cntlid; u32 ctrl_config; From cfc05bd31384c4898bf2437a4de5557f3cf9803a Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:16:00 -0800 Subject: [PATCH 20/49] mtip32xx: Fix broken service thread handling Service thread does not detect the need for taskfile error handling. Fixed the flag condition to process taskfile error. Signed-off-by: Selvan Mani Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 6 +++--- drivers/block/mtip32xx/mtip32xx.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 9b180dbbd03c..dc445b4fc00f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2917,9 +2917,7 @@ static int mtip_service_thread(void *data) * is in progress nor error handling is active */ wait_event_interruptible(port->svc_wait, (port->flags) && - !(port->flags & MTIP_PF_PAUSE_IO)); - - set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + (port->flags & MTIP_PF_SVC_THD_WORK)); if (kthread_should_stop() || test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) @@ -2929,6 +2927,8 @@ static int mtip_service_thread(void *data) &dd->dd_flag))) goto st_out; + set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + restart_eh: /* Demux bits: start with error handling */ if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) { diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 3274784008eb..8635239c521f 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -144,6 +144,11 @@ enum { MTIP_PF_REBUILD_BIT = 6, MTIP_PF_SVC_THD_STOP_BIT = 8, + MTIP_PF_SVC_THD_WORK = ((1 << MTIP_PF_EH_ACTIVE_BIT) | + (1 << MTIP_PF_ISSUE_CMDS_BIT) | + (1 << 
MTIP_PF_REBUILD_BIT) | + (1 << MTIP_PF_SVC_THD_STOP_BIT)), + /* below are bit numbers in 'dd_flag' defined in driver_data */ MTIP_DDF_SEC_LOCK_BIT = 0, MTIP_DDF_REMOVE_PENDING_BIT = 1, From e35b94738a2f7caa12017f69ef385cb6b8028965 Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:16:21 -0800 Subject: [PATCH 21/49] mtip32xx: Remove unwanted code from taskfile error handler Remove setting and clearing MTIP_PF_EH_ACTIVE_BIT flag in mtip_handle_tfe() as they are redundant. Also avoid waking up service thread from mtip_handle_tfe() because it is already woken up in case of taskfile error. Signed-off-by: Selvan Mani Signed-off-by: Rajesh Kumar Sambandam Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index dc445b4fc00f..be9ad9ca4fe1 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -618,8 +618,6 @@ static void mtip_handle_tfe(struct driver_data *dd) port = dd->port; - set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); @@ -628,7 +626,7 @@ static void mtip_handle_tfe(struct driver_data *dd) cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, PORT_IRQ_TF_ERR); } - goto handle_tfe_exit; + return; } /* clear the tag accumulator */ @@ -771,11 +769,6 @@ static void mtip_handle_tfe(struct driver_data *dd) } } print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); - -handle_tfe_exit: - /* clear eh_active */ - clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - wake_up_interruptible(&port->svc_wait); } /* From 5b7e0a8ac85e2dfd83830dc9e0b3554d153a37e3 Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:16:38 -0800 Subject: [PATCH 22/49] 
mtip32xx: Print exact time when an internal command is interrupted Print exact time when an internal command is interrupted. Signed-off-by: Selvan Mani Signed-off-by: Rajesh Kumar Sambandam Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index be9ad9ca4fe1..9ae4cc5c61ee 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -1092,6 +1092,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, struct mtip_cmd *int_cmd; struct driver_data *dd = port->dd; int rv = 0; + unsigned long start; /* Make sure the buffer is 8 byte aligned. This is asic specific. */ if (buffer & 0x00000007) { @@ -1155,6 +1156,8 @@ static int mtip_exec_internal_command(struct mtip_port *port, /* Populate the command header */ int_cmd->command_header->byte_count = 0; + start = jiffies; + /* Issue the command to the hardware */ mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL); @@ -1165,8 +1168,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, msecs_to_jiffies(timeout))) <= 0) { if (rv == -ERESTARTSYS) { /* interrupted */ dev_err(&dd->pdev->dev, - "Internal command [%02X] was interrupted after %lu ms\n", - fis->command, timeout); + "Internal command [%02X] was interrupted after %u ms\n", + fis->command, + jiffies_to_msecs(jiffies - start)); rv = -EINTR; goto exec_ic_exit; } else if (rv == 0) /* timeout */ From d8a18d2d8f5de55666c6011ed175939d22c8e3d8 Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:17:32 -0800 Subject: [PATCH 23/49] mtip32xx: Avoid issuing standby immediate cmd during FTL rebuild Prevent standby immediate command from being issued in remove, suspend and shutdown paths, while drive is in FTL rebuild process. 
Signed-off-by: Selvan Mani Signed-off-by: Vignesh Gunasekaran Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 9ae4cc5c61ee..573900c073b8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3260,20 +3260,25 @@ static int mtip_hw_init(struct driver_data *dd) return rv; } -static void mtip_standby_drive(struct driver_data *dd) +static int mtip_standby_drive(struct driver_data *dd) { - if (dd->sr) - return; + int rv = 0; + if (dd->sr || !dd->port) + return -ENODEV; /* * Send standby immediate (E0h) to the drive so that it * saves its state. */ if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && - !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) - if (mtip_standby_immediate(dd->port)) + !test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag) && + !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) { + rv = mtip_standby_immediate(dd->port); + if (rv) dev_warn(&dd->pdev->dev, "STANDBY IMMEDIATE failed\n"); + } + return rv; } /* @@ -3331,8 +3336,7 @@ static int mtip_hw_shutdown(struct driver_data *dd) * Send standby immediate (E0h) to the drive so that it * saves its state. */ - if (!dd->sr && dd->port) - mtip_standby_immediate(dd->port); + mtip_standby_drive(dd); return 0; } @@ -3355,7 +3359,7 @@ static int mtip_hw_suspend(struct driver_data *dd) * Send standby immediate (E0h) to the drive * so that it saves its state. 
*/ - if (mtip_standby_immediate(dd->port) != 0) { + if (mtip_standby_drive(dd) != 0) { dev_err(&dd->pdev->dev, "Failed standby-immediate command\n"); return -EFAULT; From 59cf70e236c96594d9f1e065755d8fce9df5356b Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:17:47 -0800 Subject: [PATCH 24/49] mtip32xx: Fix for rmmod crash when drive is in FTL rebuild When FTL rebuild is in progress, alloc_disk() initializes the disk but device node will be created by add_disk() only after successful completion of FTL rebuild. So, skip deletion of device node in removal path when FTL rebuild is in progress. Signed-off-by: Selvan Mani Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 573900c073b8..4c4d13aadf77 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2968,10 +2968,8 @@ static int mtip_service_thread(void *data) } if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { - if (mtip_ftl_rebuild_poll(dd) < 0) - set_bit(MTIP_DDF_REBUILD_FAILED_BIT, - &dd->dd_flag); - clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); + if (mtip_ftl_rebuild_poll(dd) == 0) + clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); } } @@ -3851,7 +3849,6 @@ static int mtip_block_initialize(struct driver_data *dd) mtip_hw_debugfs_init(dd); -skip_create_disk: memset(&dd->tags, 0, sizeof(dd->tags)); dd->tags.ops = &mtip_mq_ops; dd->tags.nr_hw_queues = 1; @@ -3881,6 +3878,7 @@ static int mtip_block_initialize(struct driver_data *dd) dd->disk->queue = dd->queue; dd->queue->queuedata = dd; +skip_create_disk: /* Initialize the protocol layer. 
*/ wait_for_rebuild = mtip_hw_get_identify(dd); if (wait_for_rebuild < 0) { @@ -4041,7 +4039,8 @@ static int mtip_block_remove(struct driver_data *dd) dd->bdev = NULL; } if (dd->disk) { - del_gendisk(dd->disk); + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) + del_gendisk(dd->disk); if (dd->disk->queue) { blk_cleanup_queue(dd->queue); blk_mq_free_tag_set(&dd->tags); @@ -4082,7 +4081,8 @@ static int mtip_block_shutdown(struct driver_data *dd) dev_info(&dd->pdev->dev, "Shutting down %s ...\n", dd->disk->disk_name); - del_gendisk(dd->disk); + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) + del_gendisk(dd->disk); if (dd->disk->queue) { blk_cleanup_queue(dd->queue); blk_mq_free_tag_set(&dd->tags); From 51c6570eb922146470c2fe660c34585414679bd6 Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:18:10 -0800 Subject: [PATCH 25/49] mtip32xx: Handle safe removal during IO Flush inflight IOs using fsync_bdev() when the device is safely removed. Also, block further IOs in device open function. 
Signed-off-by: Selvan Mani Signed-off-by: Rajesh Kumar Sambandam Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 34 +++++++++++++++++++++++++++++-- drivers/block/mtip32xx/mtip32xx.h | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4c4d13aadf77..6268ea006354 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3595,6 +3595,28 @@ static int mtip_block_getgeo(struct block_device *dev, return 0; } +static int mtip_block_open(struct block_device *dev, fmode_t mode) +{ + struct driver_data *dd; + + if (dev && dev->bd_disk) { + dd = (struct driver_data *) dev->bd_disk->private_data; + + if (dd) { + if (test_bit(MTIP_DDF_REMOVAL_BIT, + &dd->dd_flag)) { + return -ENODEV; + } + return 0; + } + } + return -ENODEV; +} + +void mtip_block_release(struct gendisk *disk, fmode_t mode) +{ +} + /* * Block device operation function. * @@ -3602,6 +3624,8 @@ static int mtip_block_getgeo(struct block_device *dev, * layer. 
*/ static const struct block_device_operations mtip_block_ops = { + .open = mtip_block_open, + .release = mtip_block_release, .ioctl = mtip_block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = mtip_block_compat_ioctl, @@ -4427,7 +4451,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) struct driver_data *dd = pci_get_drvdata(pdev); unsigned long flags, to; - set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); + set_bit(MTIP_DDF_REMOVAL_BIT, &dd->dd_flag); spin_lock_irqsave(&dev_lock, flags); list_del_init(&dd->online_list); @@ -4444,12 +4468,18 @@ static void mtip_pci_remove(struct pci_dev *pdev) } while (atomic_read(&dd->irq_workers_active) != 0 && time_before(jiffies, to)); + fsync_bdev(dd->bdev); + if (atomic_read(&dd->irq_workers_active) != 0) { dev_warn(&dd->pdev->dev, "Completion workers still active!\n"); } - blk_mq_stop_hw_queues(dd->queue); + if (dd->sr) + blk_mq_stop_hw_queues(dd->queue); + + set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); + /* Clean up the block layer. */ mtip_block_remove(dd); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 8635239c521f..50af742421e2 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -158,6 +158,7 @@ enum { MTIP_DDF_RESUME_BIT = 6, MTIP_DDF_INIT_DONE_BIT = 7, MTIP_DDF_REBUILD_FAILED_BIT = 8, + MTIP_DDF_REMOVAL_BIT = 9, MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | (1 << MTIP_DDF_SEC_LOCK_BIT) | From aae4a033868c496adae86fc6f9c3e0c405bbf360 Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:18:20 -0800 Subject: [PATCH 26/49] mtip32xx: Handle FTL rebuild failure state during device initialization Allow device initialization to finish gracefully when it is in FTL rebuild failure state. Also, recover device out of this state after successfully secure erasing it. 
Signed-off-by: Selvan Mani Signed-off-by: Vignesh Gunasekaran Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 6268ea006354..a64da08ccd3e 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -699,7 +699,7 @@ static void mtip_handle_tfe(struct driver_data *dd) fail_reason = "thermal shutdown"; } if (buf[288] == 0xBF) { - set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); + set_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag); dev_info(&dd->pdev->dev, "Drive indicates rebuild has failed. Secure erase required.\n"); fail_all_ncq_cmds = 1; @@ -1000,6 +1000,7 @@ static bool mtip_pause_ncq(struct mtip_port *port, (fis->features == 0x27 || fis->features == 0x72 || fis->features == 0x62 || fis->features == 0x26))) { clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + clear_bit(MTIP_DDF_REBUILD_FAILED_BIT, &port->dd->dd_flag); /* Com reset after secure erase or lowlevel format */ mtip_restart_port(port); clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); @@ -1166,6 +1167,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, if ((rv = wait_for_completion_interruptible_timeout( &wait, msecs_to_jiffies(timeout))) <= 0) { + if (rv == -ERESTARTSYS) { /* interrupted */ dev_err(&dd->pdev->dev, "Internal command [%02X] was interrupted after %u ms\n", @@ -3084,7 +3086,7 @@ static int mtip_hw_get_identify(struct driver_data *dd) if (buf[288] == 0xBF) { dev_info(&dd->pdev->dev, "Drive indicates rebuild has failed.\n"); - /* TODO */ + set_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag); } } @@ -3687,10 +3689,9 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) rq_data_dir(rq))) { return -ENODATA; } - if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) + if 
(unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag) || + test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))) return -ENODATA; - if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) - return -ENXIO; } if (rq->cmd_flags & REQ_DISCARD) { From abb0ccd185c9e31847709b86192e6c815d1f57ad Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:21:13 -0800 Subject: [PATCH 27/49] mtip32xx: Implement timeout handler Added timeout handler. Replaced blk_mq_end_request() with blk_mq_complete_request() to avoid double completion of a request. Signed-off-by: Selvan Mani Signed-off-by: Rajesh Kumar Sambandam Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 95 ++++++++++++++++++++++++++++--- drivers/block/mtip32xx/mtip32xx.h | 7 ++- 2 files changed, 92 insertions(+), 10 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index a64da08ccd3e..c8f5d8c393f7 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -233,15 +233,9 @@ static void mtip_async_complete(struct mtip_port *port, "Command tag %d failed due to TFE\n", tag); } - /* Unmap the DMA scatter list entries */ - dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction); - rq = mtip_rq_from_tag(dd, tag); - if (unlikely(cmd->unaligned)) - up(&port->cmd_slot_unal); - - blk_mq_end_request(rq, status ? 
-EIO : 0); + blk_mq_complete_request(rq, status); } /* @@ -2889,6 +2883,42 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) return -EFAULT; } +static void mtip_softirq_done_fn(struct request *rq) +{ + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct driver_data *dd = rq->q->queuedata; + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, + cmd->direction); + + if (unlikely(cmd->unaligned)) + up(&dd->port->cmd_slot_unal); + + blk_mq_end_request(rq, rq->errors); +} + +static void mtip_abort_cmd(struct request *req, void *data, + bool reserved) +{ + struct driver_data *dd = data; + + dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag); + + clear_bit(req->tag, dd->port->cmds_to_issue); + req->errors = -EIO; + mtip_softirq_done_fn(req); +} + +static void mtip_queue_cmd(struct request *req, void *data, + bool reserved) +{ + struct driver_data *dd = data; + + set_bit(req->tag, dd->port->cmds_to_issue); + blk_abort_request(req); +} + /* * service thread to issue queued commands * @@ -2901,7 +2931,7 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) static int mtip_service_thread(void *data) { struct driver_data *dd = (struct driver_data *)data; - unsigned long slot, slot_start, slot_wrap; + unsigned long slot, slot_start, slot_wrap, to; unsigned int num_cmd_slots = dd->slot_groups * 32; struct mtip_port *port = dd->port; @@ -2938,6 +2968,32 @@ static int mtip_service_thread(void *data) if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) goto restart_eh; + if (test_bit(MTIP_PF_TO_ACTIVE_BIT, &port->flags)) { + to = jiffies + msecs_to_jiffies(5000); + + do { + mdelay(100); + } while (atomic_read(&dd->irq_workers_active) != 0 && + time_before(jiffies, to)); + + if (atomic_read(&dd->irq_workers_active) != 0) + dev_warn(&dd->pdev->dev, + "Completion workers still active!"); + + spin_lock(dd->queue->queue_lock); + blk_mq_all_tag_busy_iter(*dd->tags.tags, + mtip_queue_cmd, dd); + 
spin_unlock(dd->queue->queue_lock); + + set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags); + + if (mtip_device_reset(dd)) + blk_mq_all_tag_busy_iter(*dd->tags.tags, + mtip_abort_cmd, dd); + + clear_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags); + } + if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { slot = 1; /* used to restrict the loop to one iteration */ @@ -3803,11 +3859,33 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, return 0; } +static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, + bool reserved) +{ + struct driver_data *dd = req->q->queuedata; + int ret = BLK_EH_RESET_TIMER; + + if (reserved) + goto exit_handler; + + if (test_bit(req->tag, dd->port->cmds_to_issue)) + goto exit_handler; + + if (test_and_set_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags)) + goto exit_handler; + + wake_up_interruptible(&dd->port->svc_wait); +exit_handler: + return ret; +} + static struct blk_mq_ops mtip_mq_ops = { .queue_rq = mtip_queue_rq, .map_queue = blk_mq_map_queue, .init_request = mtip_init_cmd, .exit_request = mtip_free_cmd, + .complete = mtip_softirq_done_fn, + .timeout = mtip_cmd_timeout, }; /* @@ -3883,6 +3961,7 @@ static int mtip_block_initialize(struct driver_data *dd) dd->tags.numa_node = dd->numa_node; dd->tags.flags = BLK_MQ_F_SHOULD_MERGE; dd->tags.driver_data = dd; + dd->tags.timeout = MTIP_NCQ_CMD_TIMEOUT_MS; rv = blk_mq_alloc_tag_set(&dd->tags); if (rv) { diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 50af742421e2..7617888f7944 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -134,10 +134,12 @@ enum { MTIP_PF_EH_ACTIVE_BIT = 1, /* error handling */ MTIP_PF_SE_ACTIVE_BIT = 2, /* secure erase */ MTIP_PF_DM_ACTIVE_BIT = 3, /* download microcde */ + MTIP_PF_TO_ACTIVE_BIT = 9, /* timeout handling */ MTIP_PF_PAUSE_IO = ((1 << MTIP_PF_IC_ACTIVE_BIT) | (1 << MTIP_PF_EH_ACTIVE_BIT) | (1 << MTIP_PF_SE_ACTIVE_BIT) | - (1 << 
MTIP_PF_DM_ACTIVE_BIT)), + (1 << MTIP_PF_DM_ACTIVE_BIT) | + (1 << MTIP_PF_TO_ACTIVE_BIT)), MTIP_PF_SVC_THD_ACTIVE_BIT = 4, MTIP_PF_ISSUE_CMDS_BIT = 5, @@ -147,7 +149,8 @@ enum { MTIP_PF_SVC_THD_WORK = ((1 << MTIP_PF_EH_ACTIVE_BIT) | (1 << MTIP_PF_ISSUE_CMDS_BIT) | (1 << MTIP_PF_REBUILD_BIT) | - (1 << MTIP_PF_SVC_THD_STOP_BIT)), + (1 << MTIP_PF_SVC_THD_STOP_BIT) | + (1 << MTIP_PF_TO_ACTIVE_BIT)), /* below are bit numbers in 'dd_flag' defined in driver_data */ MTIP_DDF_SEC_LOCK_BIT = 0, From 008e56d200225321371748d95908e6222436f06d Mon Sep 17 00:00:00 2001 From: Asai Thambi SP Date: Wed, 24 Feb 2016 21:21:20 -0800 Subject: [PATCH 28/49] mtip32xx: Cleanup queued requests after surprise removal Fail all pending requests after surprise removal of a drive. Signed-off-by: Vignesh Gunasekaran Signed-off-by: Selvan Mani Signed-off-by: Asai Thambi S P Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 78 ++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index c8f5d8c393f7..1c330b61f05d 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -173,7 +173,13 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; + if (mtip_check_surprise_removal(dd->pdev)) + return NULL; + rq = blk_mq_alloc_request(dd->queue, 0, BLK_MQ_REQ_RESERVED); + if (IS_ERR(rq)) + return NULL; + return blk_mq_rq_to_pdu(rq); } @@ -575,6 +581,8 @@ static void mtip_completion(struct mtip_port *port, dev_warn(&port->dd->pdev->dev, "Internal command %d completed with TFE\n", tag); + command->comp_func = NULL; + command->comp_data = NULL; complete(waiting); } @@ -1009,12 +1017,14 @@ static bool mtip_pause_ncq(struct mtip_port *port, * * @port Pointer to port data structure * @timeout Max duration to wait (ms) + * @atomic gfp_t flag to indicate blockable context or not * * return value 
* 0 Success * -EBUSY Commands still active */ -static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) +static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout, + gfp_t atomic) { unsigned long to; unsigned int n; @@ -1025,16 +1035,21 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) to = jiffies + msecs_to_jiffies(timeout); do { if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && - test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { + test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags) && + atomic == GFP_KERNEL) { msleep(20); continue; /* svc thd is actively issuing commands */ } - msleep(100); + if (atomic == GFP_KERNEL) + msleep(100); + else { + cpu_relax(); + udelay(100); + } + if (mtip_check_surprise_removal(port->dd->pdev)) goto err_fault; - if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) - goto err_fault; /* * Ignore s_active bit 0 of array element 0. @@ -1096,6 +1111,10 @@ static int mtip_exec_internal_command(struct mtip_port *port, } int_cmd = mtip_get_int_command(dd); + if (!int_cmd) { + dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n"); + return -EFAULT; + } set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); @@ -1108,7 +1127,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, if (fis->command != ATA_CMD_STANDBYNOW1) { /* wait for io to complete if non atomic */ if (mtip_quiesce_io(port, - MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) { + MTIP_QUIESCE_IO_TIMEOUT_MS, atomic) < 0) { dev_warn(&dd->pdev->dev, "Failed to quiesce IO\n"); mtip_put_int_command(dd, int_cmd); @@ -3347,10 +3366,6 @@ static int mtip_standby_drive(struct driver_data *dd) */ static int mtip_hw_exit(struct driver_data *dd) { - /* - * Send standby immediate (E0h) to the drive so that it - * saves its state. - */ if (!dd->sr) { /* de-initialize the port. 
*/ mtip_deinit_port(dd->port); @@ -3967,7 +3982,7 @@ static int mtip_block_initialize(struct driver_data *dd) if (rv) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); - goto block_queue_alloc_init_error; + goto block_queue_alloc_tag_error; } /* Allocate the request queue. */ @@ -4079,8 +4094,9 @@ static int mtip_block_initialize(struct driver_data *dd) read_capacity_error: init_hw_cmds_error: blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); block_queue_alloc_init_error: + blk_mq_free_tag_set(&dd->tags); +block_queue_alloc_tag_error: mtip_hw_debugfs_exit(dd); disk_index_error: spin_lock(&rssd_index_lock); @@ -4097,6 +4113,22 @@ static int mtip_block_initialize(struct driver_data *dd) return rv; } +static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) +{ + struct driver_data *dd = (struct driver_data *)data; + struct mtip_cmd *cmd; + + if (likely(!reserv)) + blk_mq_complete_request(rq, -ENODEV); + else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) { + + cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); + if (cmd->comp_func) + cmd->comp_func(dd->port, MTIP_TAG_INTERNAL, + cmd, -ENODEV); + } +} + /* * Block layer deinitialization function. * @@ -4128,12 +4160,23 @@ static int mtip_block_remove(struct driver_data *dd) } } - if (!dd->sr) - mtip_standby_drive(dd); + if (!dd->sr) { + /* + * Explicitly wait here for IOs to quiesce, + * as mtip_standby_drive usually won't wait for IOs. + */ + if (!mtip_quiesce_io(dd->port, MTIP_QUIESCE_IO_TIMEOUT_MS, + GFP_KERNEL)) + mtip_standby_drive(dd); + } else dev_info(&dd->pdev->dev, "device %s surprise removal\n", dd->disk->disk_name); + blk_mq_freeze_queue_start(dd->queue); + blk_mq_stop_hw_queues(dd->queue); + blk_mq_all_tag_busy_iter(dd->tags.tags[0], mtip_no_dev_cleanup, dd); + /* * Delete our gendisk structure. 
This also removes the device * from /dev @@ -4548,16 +4591,15 @@ static void mtip_pci_remove(struct pci_dev *pdev) } while (atomic_read(&dd->irq_workers_active) != 0 && time_before(jiffies, to)); - fsync_bdev(dd->bdev); + if (!dd->sr) + fsync_bdev(dd->bdev); if (atomic_read(&dd->irq_workers_active) != 0) { dev_warn(&dd->pdev->dev, "Completion workers still active!\n"); } - if (dd->sr) - blk_mq_stop_hw_queues(dd->queue); - + blk_set_queue_dying(dd->queue); set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); /* Clean up the block layer. */ From 14e710fe7897e37762512d336ab081c57de579a4 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 10 Feb 2016 04:21:15 -0700 Subject: [PATCH 29/49] xen-blkfront: rename indirect descriptor parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "max" is rather ambiguous and carries pretty little meaning, the more that there are also "max_queues" and "max_ring_page_order". Make this "max_indirect_segments" instead, and at once change the type from int to uint (to match the respective variable's type). 
Acked-by: Roger Pau Monné Signed-off-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8a8dc91c39f7..008121bdece1 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -125,8 +125,10 @@ static const struct block_device_operations xlvbd_block_fops; */ static unsigned int xen_blkif_max_segments = 32; -module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); -MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); +module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, + S_IRUGO); +MODULE_PARM_DESC(max_indirect_segments, + "Maximum amount of segments in indirect requests (default is 32)"); static unsigned int xen_blkif_max_queues = 4; module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO); From 5a7058450cbc8702f976d1f444974485c70cb525 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 10 Feb 2016 04:18:10 -0700 Subject: [PATCH 30/49] xen-blkback: advertise indirect segment support earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no reason to defer this until the connect phase, and in fact there are frontend implementations expecting this to be available earlier. Move it into the probe function. 
Acked-by: Roger Pau Monné Signed-off-by: Jan Beulich Cc: Bob Liu Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 876763f7f13e..16e28ee880a4 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -618,6 +618,14 @@ static int xen_blkbk_probe(struct xenbus_device *dev, goto fail; } + err = xenbus_printf(XBT_NIL, dev->nodename, + "feature-max-indirect-segments", "%u", + MAX_INDIRECT_SEGMENTS); + if (err) + dev_warn(&dev->dev, + "writing %s/feature-max-indirect-segments (%d)", + dev->nodename, err); + /* Multi-queue: advertise how many queues are supported by us.*/ err = xenbus_printf(XBT_NIL, dev->nodename, "multi-queue-max-queues", "%u", xenblk_max_queues); @@ -849,11 +857,6 @@ static void connect(struct backend_info *be) dev->nodename); goto abort; } - err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u", - MAX_INDIRECT_SEGMENTS); - if (err) - dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)", - dev->nodename, err); err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(&be->blkif->vbd)); From 5e422cffe822874cefc1657cd287c8647b2782dd Mon Sep 17 00:00:00 2001 From: Alan Date: Fri, 19 Feb 2016 13:56:57 +0100 Subject: [PATCH 31/49] lightnvm: fix up nonsensical configure overrun checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of checking a constant 0 actually check the space available. Even better remember to allow for the header and also check the right amount of space is needed. 
Signed-off-by: Alan Cox Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 33224cb91c5b..782ac5d60a49 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -870,20 +870,19 @@ static int nvm_configure_by_str_event(const char *val, static int nvm_configure_get(char *buf, const struct kernel_param *kp) { - int sz = 0; - char *buf_start = buf; + int sz; struct nvm_dev *dev; - buf += sprintf(buf, "available devices:\n"); + sz = sprintf(buf, "available devices:\n"); down_write(&nvm_lock); list_for_each_entry(dev, &nvm_devices, devices) { - if (sz > 4095 - DISK_NAME_LEN) + if (sz > 4095 - DISK_NAME_LEN - 2) break; - buf += sprintf(buf, " %32s\n", dev->name); + sz += sprintf(buf + sz, " %32s\n", dev->name); } up_write(&nvm_lock); - return buf - buf_start - 1; + return sz; } static const struct kernel_param_ops nvm_configure_by_str_event_param_ops = { From d5bdec8ddb9f5fac3b351bed463a7132f6ba907b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Fri, 19 Feb 2016 13:56:58 +0100 Subject: [PATCH 32/49] lightnvm: fold get bb tbl when using dual/quad plane mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the media manager runs in dual or quad plane mode, lightnvm abstracts away plane specific commands. This poses a problem for get bad block table, as it reports bad blocks per plane, making the table either two or four times bigger than expected. Fold the bad block list before returning. 
Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 +- drivers/nvme/host/lightnvm.c | 46 ++++++++++++++++++++++++++++++++---- include/linux/lightnvm.h | 6 ++--- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 782ac5d60a49..968ba7ed4158 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -250,7 +250,7 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, return 0; } - plane_cnt = (1 << dev->plane_mode); + plane_cnt = dev->plane_mode; rqd->nr_pages = plane_cnt * nr_ppas; if (dev->ops->max_phys_sect < rqd->nr_pages) diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 5cd3725e2fa4..d4f81f07f296 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -373,8 +373,31 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, return ret; } +static void nvme_nvm_bb_tbl_fold(struct nvm_dev *nvmdev, + int nr_dst_blks, u8 *dst_blks, + int nr_src_blks, u8 *src_blks) +{ + int blk, offset, pl, blktype; + + for (blk = 0; blk < nr_dst_blks; blk++) { + offset = blk * nvmdev->plane_mode; + blktype = src_blks[offset]; + + /* Bad blocks on any planes take precedence over other types */ + for (pl = 0; pl < nvmdev->plane_mode; pl++) { + if (src_blks[offset + pl] & + (NVM_BLK_T_BAD|NVM_BLK_T_GRWN_BAD)) { + blktype = src_blks[offset + pl]; + break; + } + } + + dst_blks[blk] = blktype; + } +} + static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, - int nr_blocks, nvm_bb_update_fn *update_bbtbl, + int nr_dst_blks, nvm_bb_update_fn *update_bbtbl, void *priv) { struct request_queue *q = nvmdev->q; @@ -382,7 +405,9 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; - int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; + u8 *dst_blks = 
NULL; + int nr_src_blks = nr_dst_blks * nvmdev->plane_mode; + int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_src_blks; int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; @@ -393,6 +418,12 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; + dst_blks = kzalloc(nr_dst_blks, GFP_KERNEL); + if (!dst_blks) { + ret = -ENOMEM; + goto out; + } + ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { @@ -414,16 +445,21 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, goto out; } - if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { + if (le32_to_cpu(bb_tbl->tblks) != nr_src_blks) { ret = -EINVAL; dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", - le32_to_cpu(bb_tbl->tblks), nr_blocks); + le32_to_cpu(bb_tbl->tblks), nr_src_blks); goto out; } + nvme_nvm_bb_tbl_fold(nvmdev, nr_dst_blks, dst_blks, + nr_src_blks, bb_tbl->blk); + ppa = dev_to_generic_addr(nvmdev, ppa); - ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); + ret = update_bbtbl(ppa, nr_dst_blks, dst_blks, priv); + out: + kfree(dst_blks); kfree(bb_tbl); return ret; } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index d6750111e48e..7fa1838f7356 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -92,9 +92,9 @@ enum { NVM_ADDRMODE_CHANNEL = 1, /* Plane programming mode for LUN */ - NVM_PLANE_SINGLE = 0, - NVM_PLANE_DOUBLE = 1, - NVM_PLANE_QUAD = 2, + NVM_PLANE_SINGLE = 1, + NVM_PLANE_DOUBLE = 2, + NVM_PLANE_QUAD = 4, /* Status codes */ NVM_RSP_SUCCESS = 0x0, From fa3184b898717d696242241541b8cbcb65c5d497 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 3 Feb 2016 16:40:05 -0500 Subject: [PATCH 33/49] xen/blback: Fit the important information of the thread in 17 characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The processes names are truncated to 17, while we had the 
length of the process as name 20 - which meant that while we filled it out with various details - the last 3 characters (which had the queue number) never surfaced to the user-space. To simplify this and be able to fit the device name, domain id, and the queue number we remove the 'blkback' from the name. Prior to this patch the device name is "blkback.<domid>.<name>" for example: blkback.8.xvda, blkback.11.hda. With the multiqueue block backend we add "-%d" for the queue. But sadly this is already way past the limit so it gets stripped. A possible solution had been identified by Ian: http://lists.xenproject.org/archives/html/xen-devel/2015-05/msg03516.html " If you are pressed for space then the "xvd" is probably a bit redundant in a string which starts blkbk. The guest may not even call the device xvdN (iirc BSD has another prefix) any how, so having blkback say so seems of limited use anyway. Since this seems to not include a partition number how does this work in the split partition scheme? (i.e. one where the guest is given xvda1 and xvda2 rather than xvda with a partition table) [It will be 'blkback.8.xvda1', and 'blkback.11.xvda2'] Perhaps something derived from one of the schemes in http://xenbits.xen.org/docs/unstable/misc/vbd-interface.txt might be a better fit? " After a bit of discussion (see http://lists.xenproject.org/archives/html/xen-devel/2015-12/msg01588.html) we settled on dropping the "blkback" part.
This will make it possible to have the <domid>.<name>-<queue>: [1.xvda-0] [1.xvda-1] And we have enough space to make it go up to: [32100.xvdfg9-5] Acked-by: Roger Pau Monné Reported-by: Jan Beulich Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/xenbus.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 16e28ee880a4..26aa080e243c 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -23,8 +23,7 @@ #include #include "common.h" -/* Enlarge the array size in order to fully show blkback name. */ -#define BLKBACK_NAME_LEN (20) +/* On the XenBus the max length of 'ring-ref%u'. */ #define RINGREF_NAME_LEN (20) struct backend_info { @@ -76,7 +75,7 @@ static int blkback_name(struct xen_blkif *blkif, char *buf) else devname = devpath; - snprintf(buf, BLKBACK_NAME_LEN, "blkback.%d.%s", blkif->domid, devname); + snprintf(buf, TASK_COMM_LEN, "%d.%s", blkif->domid, devname); kfree(devpath); return 0; @@ -85,7 +84,7 @@ static int blkback_name(struct xen_blkif *blkif, char *buf) static void xen_update_blkif_status(struct xen_blkif *blkif) { int err; - char name[BLKBACK_NAME_LEN]; + char name[TASK_COMM_LEN]; struct xen_blkif_ring *ring; int i; From 6adb03de406e8c92579c2e4b11640841fa908277 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Sat, 20 Feb 2016 08:52:40 +0100 Subject: [PATCH 34/49] lightnvm: update closed list outside of intr context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an I/O finishes, full blocks are moved from the open to the closed list - a lock is taken to protect the list. This happens at the moment in the interrupt context, which is not correct. This patch moves this logic to the block workqueue instead, avoiding holding a spinlock without interrupt save in an interrupt context.
Signed-off-by: Javier González Fixes: ff0e498bfa18 ("lightnvm: manage open and closed blocks sepa...") Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/rrpc.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index d8c75958ced3..e2710da5cd78 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -497,12 +497,21 @@ static void rrpc_gc_queue(struct work_struct *work) struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; struct nvm_lun *lun = rblk->parent->lun; + struct nvm_block *blk = rblk->parent; struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset]; spin_lock(&rlun->lock); list_add_tail(&rblk->prio, &rlun->prio_list); spin_unlock(&rlun->lock); + spin_lock(&lun->lock); + lun->nr_open_blocks--; + lun->nr_closed_blocks++; + blk->state &= ~NVM_BLK_ST_OPEN; + blk->state |= NVM_BLK_ST_CLOSED; + list_move_tail(&rblk->list, &rlun->closed_list); + spin_unlock(&lun->lock); + mempool_free(gcb, rrpc->gcb_pool); pr_debug("nvm: block '%lu' is full, allow GC (sched)\n", rblk->parent->id); @@ -666,20 +675,8 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, lun = rblk->parent->lun; cmnt_size = atomic_inc_return(&rblk->data_cmnt_size); - if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) { - struct nvm_block *blk = rblk->parent; - struct rrpc_lun *rlun = rblk->rlun; - - spin_lock(&lun->lock); - lun->nr_open_blocks--; - lun->nr_closed_blocks++; - blk->state &= ~NVM_BLK_ST_OPEN; - blk->state |= NVM_BLK_ST_CLOSED; - list_move_tail(&rblk->list, &rlun->closed_list); - spin_unlock(&lun->lock); - + if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) rrpc_run_gc(rrpc, rblk); - } } } From 4ece44af733ff63a7cd12aaa8c85afb6d9fdc664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sat, 20 Feb 2016 08:52:41 +0100 Subject: [PATCH 35/49] lightnvm: rename ->nr_pages to ->nr_sects MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct rrpc->nr_pages can easily be interpreted as the number of flash pages allocated to rrpc, while it is the nr_sects. Make sure that this is reflected from the variable name. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 2 +- drivers/lightnvm/gennvm.c | 7 +++---- drivers/lightnvm/rrpc.c | 29 ++++++++++++++--------------- drivers/lightnvm/rrpc.h | 6 +++--- include/linux/lightnvm.h | 2 +- 5 files changed, 22 insertions(+), 24 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 968ba7ed4158..1cb4b331c3e8 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -467,7 +467,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->blks_per_lun * dev->luns_per_chnl * dev->nr_chnls; - dev->total_pages = dev->total_blocks * dev->pgs_per_blk; + dev->total_secs = dev->total_blocks * dev->sec_per_blk; INIT_LIST_HEAD(&dev->online_targets); mutex_init(&dev->mlock); diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index 7fb725b16148..d65ec36a2231 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -100,14 +100,13 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) { struct nvm_dev *dev = private; struct gen_nvm *gn = dev->mp; - sector_t max_pages = dev->total_pages * (dev->sec_size >> 9); u64 elba = slba + nlb; struct gen_lun *lun; struct nvm_block *blk; u64 i; int lun_id; - if (unlikely(elba > dev->total_pages)) { + if (unlikely(elba > dev->total_secs)) { pr_err("gennvm: L2P data from device is out of bounds!\n"); return -EINVAL; } @@ -115,7 +114,7 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) for (i = 0; i < nlb; i++) { u64 pba = le64_to_cpu(entries[i]); - if (unlikely(pba >= max_pages && pba != U64_MAX)) { + if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) { pr_err("gennvm: L2P data entry is out of 
bounds!\n"); return -EINVAL; } @@ -197,7 +196,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } if (dev->ops->get_l2p_tbl) { - ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, gennvm_block_map, dev); if (ret) { pr_err("gennvm: could not read L2P table.\n"); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index e2710da5cd78..c4d0b04ac521 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -552,7 +552,7 @@ static struct rrpc_addr *rrpc_update_map(struct rrpc *rrpc, sector_t laddr, struct rrpc_addr *gp; struct rrpc_rev_addr *rev; - BUG_ON(laddr >= rrpc->nr_pages); + BUG_ON(laddr >= rrpc->nr_sects); gp = &rrpc->trans_map[laddr]; spin_lock(&rrpc->rev_lock); @@ -721,7 +721,7 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio, for (i = 0; i < npages; i++) { /* We assume that mapping occurs at 4KB granularity */ - BUG_ON(!(laddr + i >= 0 && laddr + i < rrpc->nr_pages)); + BUG_ON(!(laddr + i >= 0 && laddr + i < rrpc->nr_sects)); gp = &rrpc->trans_map[laddr + i]; if (gp->rblk) { @@ -752,7 +752,7 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd, if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) return NVM_IO_REQUEUE; - BUG_ON(!(laddr >= 0 && laddr < rrpc->nr_pages)); + BUG_ON(!(laddr >= 0 && laddr < rrpc->nr_sects)); gp = &rrpc->trans_map[laddr]; if (gp->rblk) { @@ -1002,11 +1002,10 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private) struct nvm_dev *dev = rrpc->dev; struct rrpc_addr *addr = rrpc->trans_map + slba; struct rrpc_rev_addr *raddr = rrpc->rev_trans_map; - sector_t max_pages = dev->total_pages * (dev->sec_size >> 9); u64 elba = slba + nlb; u64 i; - if (unlikely(elba > dev->total_pages)) { + if (unlikely(elba > dev->total_secs)) { pr_err("nvm: L2P data from device is out of bounds!\n"); return -EINVAL; } @@ -1016,7 +1015,7 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, 
__le64 *entries, void *private) /* LNVM treats address-spaces as silos, LBA and PBA are * equally large and zero-indexed. */ - if (unlikely(pba >= max_pages && pba != U64_MAX)) { + if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) { pr_err("nvm: L2P data entry is out of bounds!\n"); return -EINVAL; } @@ -1041,16 +1040,16 @@ static int rrpc_map_init(struct rrpc *rrpc) sector_t i; int ret; - rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_pages); + rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects); if (!rrpc->trans_map) return -ENOMEM; rrpc->rev_trans_map = vmalloc(sizeof(struct rrpc_rev_addr) - * rrpc->nr_pages); + * rrpc->nr_sects); if (!rrpc->rev_trans_map) return -ENOMEM; - for (i = 0; i < rrpc->nr_pages; i++) { + for (i = 0; i < rrpc->nr_sects; i++) { struct rrpc_addr *p = &rrpc->trans_map[i]; struct rrpc_rev_addr *r = &rrpc->rev_trans_map[i]; @@ -1062,8 +1061,8 @@ static int rrpc_map_init(struct rrpc *rrpc) return 0; /* Bring up the mapping table from device */ - ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, - rrpc_l2p_update, rrpc); + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_secs, rrpc_l2p_update, + rrpc); if (ret) { pr_err("nvm: rrpc: could not read L2P table.\n"); return -EINVAL; @@ -1163,7 +1162,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) spin_lock_init(&rlun->lock); rrpc->total_blocks += dev->blks_per_lun; - rrpc->nr_pages += dev->sec_per_lun; + rrpc->nr_sects += dev->sec_per_lun; rlun->blocks = vzalloc(sizeof(struct rrpc_block) * rrpc->dev->blks_per_lun); @@ -1216,9 +1215,9 @@ static sector_t rrpc_capacity(void *private) /* cur, gc, and two emergency blocks for each lun */ reserved = rrpc->nr_luns * dev->max_pages_per_blk * 4; - provisioned = rrpc->nr_pages - reserved; + provisioned = rrpc->nr_sects - reserved; - if (reserved > rrpc->nr_pages) { + if (reserved > rrpc->nr_sects) { pr_err("rrpc: not enough space available to expose storage.\n"); return 0; } @@ -1381,7 
+1380,7 @@ static void *rrpc_init(struct nvm_dev *dev, struct gendisk *tdisk, blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue)); pr_info("nvm: rrpc initialized with %u luns and %llu pages.\n", - rrpc->nr_luns, (unsigned long long)rrpc->nr_pages); + rrpc->nr_luns, (unsigned long long)rrpc->nr_sects); mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10)); diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index ef13ac7700c8..dfca5c4d26bb 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -104,7 +104,7 @@ struct rrpc { struct rrpc_lun *luns; /* calculated values */ - unsigned long long nr_pages; + unsigned long long nr_sects; unsigned long total_blocks; /* Write strategy variables. Move these into each for structure for each @@ -205,7 +205,7 @@ static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr, unsigned pages, struct rrpc_inflight_rq *r) { - BUG_ON((laddr + pages) > rrpc->nr_pages); + BUG_ON((laddr + pages) > rrpc->nr_sects); return __rrpc_lock_laddr(rrpc, laddr, pages, r); } @@ -242,7 +242,7 @@ static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd) struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd); uint8_t pages = rqd->nr_pages; - BUG_ON((r->l_start + pages) > rrpc->nr_pages); + BUG_ON((r->l_start + pages) > rrpc->nr_sects); rrpc_unlock_laddr(rrpc, r); } diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 7fa1838f7356..8f8a74328f20 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -337,8 +337,8 @@ struct nvm_dev { int lps_per_blk; int *lptbl; - unsigned long total_pages; unsigned long total_blocks; + unsigned long total_secs; int nr_luns; unsigned max_pages_per_blk; From ed2a92a6b4b8453a0c3a20da641ec79e4b3d7ca4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Sat, 20 Feb 2016 08:52:42 +0100 Subject: [PATCH 36/49] lightnvm: remove struct nvm_dev->total_blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The struct nvm_dev->total_blocks was only used for calculating total sectors. Remove and instead calculate total sectors from the number of luns and its sectors. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 1cb4b331c3e8..773a55da0e20 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -463,11 +463,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->sec_per_lun = dev->sec_per_blk * dev->blks_per_lun; dev->nr_luns = dev->luns_per_chnl * dev->nr_chnls; - dev->total_blocks = dev->nr_planes * - dev->blks_per_lun * - dev->luns_per_chnl * - dev->nr_chnls; - dev->total_secs = dev->total_blocks * dev->sec_per_blk; + dev->total_secs = dev->nr_luns * dev->sec_per_lun; INIT_LIST_HEAD(&dev->online_targets); mutex_init(&dev->mlock); From afb18e0ed862fae05aa41be278c1ddb87473c7b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 3 Mar 2016 14:47:53 -0700 Subject: [PATCH 37/49] lightnvm: generalize rrpc ppa calculations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In rrpc, some calculations assume a certain configuration (e.g., 1 LUN, 1 sector per page). The reason behind this was that LightNVM used a simple configuration with QEMU to test core features in the beginning. This patch relaxes these assumptions and generalizes calculation, allowing multiple luns to be used. 
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/rrpc.c | 48 ++++++++++++++++++++++++++--------------- drivers/lightnvm/rrpc.h | 9 ++++++++ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index c4d0b04ac521..f015fdc9c281 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -38,7 +38,7 @@ static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a) spin_lock(&rblk->lock); - div_u64_rem(a->addr, rrpc->dev->pgs_per_blk, &pg_offset); + div_u64_rem(a->addr, rrpc->dev->sec_per_blk, &pg_offset); WARN_ON(test_and_set_bit(pg_offset, rblk->invalid_pages)); rblk->nr_invalid_pages++; @@ -113,14 +113,24 @@ static void rrpc_discard(struct rrpc *rrpc, struct bio *bio) static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk) { - return (rblk->next_page == rrpc->dev->pgs_per_blk); + return (rblk->next_page == rrpc->dev->sec_per_blk); } +/* Calculate relative addr for the given block, considering instantiated LUNs */ +static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk) +{ + struct nvm_block *blk = rblk->parent; + int lun_blk = blk->id % (rrpc->dev->blks_per_lun * rrpc->nr_luns); + + return lun_blk * rrpc->dev->sec_per_blk; +} + +/* Calculate global addr for the given block */ static u64 block_to_addr(struct rrpc *rrpc, struct rrpc_block *rblk) { struct nvm_block *blk = rblk->parent; - return blk->id * rrpc->dev->pgs_per_blk; + return blk->id * rrpc->dev->sec_per_blk; } static struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, @@ -136,7 +146,7 @@ static struct ppa_addr linear_to_generic_addr(struct nvm_dev *dev, l.g.sec = secs; sector_div(ppa, dev->sec_per_pg); - div_u64_rem(ppa, dev->sec_per_blk, &pgs); + div_u64_rem(ppa, dev->pgs_per_blk, &pgs); l.g.pg = pgs; sector_div(ppa, dev->pgs_per_blk); @@ -191,12 +201,12 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun 
*rlun, return NULL; } - rblk = &rlun->blocks[blk->id]; + rblk = rrpc_get_rblk(rlun, blk->id); list_add_tail(&rblk->list, &rlun->open_list); spin_unlock(&lun->lock); blk->priv = rblk; - bitmap_zero(rblk->invalid_pages, rrpc->dev->pgs_per_blk); + bitmap_zero(rblk->invalid_pages, rrpc->dev->sec_per_blk); rblk->next_page = 0; rblk->nr_invalid_pages = 0; atomic_set(&rblk->data_cmnt_size, 0); @@ -286,11 +296,11 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) struct bio *bio; struct page *page; int slot; - int nr_pgs_per_blk = rrpc->dev->pgs_per_blk; + int nr_sec_per_blk = rrpc->dev->sec_per_blk; u64 phys_addr; DECLARE_COMPLETION_ONSTACK(wait); - if (bitmap_full(rblk->invalid_pages, nr_pgs_per_blk)) + if (bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) return 0; bio = bio_alloc(GFP_NOIO, 1); @@ -304,10 +314,10 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) return -ENOMEM; while ((slot = find_first_zero_bit(rblk->invalid_pages, - nr_pgs_per_blk)) < nr_pgs_per_blk) { + nr_sec_per_blk)) < nr_sec_per_blk) { /* Lock laddr */ - phys_addr = (rblk->parent->id * nr_pgs_per_blk) + slot; + phys_addr = rblk->parent->id * nr_sec_per_blk + slot; try: spin_lock(&rrpc->rev_lock); @@ -379,7 +389,7 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) mempool_free(page, rrpc->page_pool); bio_put(bio); - if (!bitmap_full(rblk->invalid_pages, nr_pgs_per_blk)) { + if (!bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) { pr_err("nvm: failed to garbage collect block\n"); return -EIO; } @@ -675,7 +685,7 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, lun = rblk->parent->lun; cmnt_size = atomic_inc_return(&rblk->data_cmnt_size); - if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) + if (unlikely(cmnt_size == rrpc->dev->sec_per_blk)) rrpc_run_gc(rrpc, rblk); } } @@ -1012,6 +1022,7 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private) for (i = 0; i < nlb; 
i++) { u64 pba = le64_to_cpu(entries[i]); + unsigned int mod; /* LNVM treats address-spaces as silos, LBA and PBA are * equally large and zero-indexed. */ @@ -1027,8 +1038,10 @@ static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private) if (!pba) continue; + div_u64_rem(pba, rrpc->nr_sects, &mod); + addr[i].addr = pba; - raddr[pba].addr = slba + i; + raddr[mod].addr = slba + i; } return 0; @@ -1135,7 +1148,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) struct rrpc_lun *rlun; int i, j; - if (dev->pgs_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { + if (dev->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { pr_err("rrpc: number of pages per block too high."); return -EINVAL; } @@ -1236,10 +1249,11 @@ static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk) struct nvm_dev *dev = rrpc->dev; int offset; struct rrpc_addr *laddr; - u64 paddr, pladdr; + u64 bpaddr, paddr, pladdr; - for (offset = 0; offset < dev->pgs_per_blk; offset++) { - paddr = block_to_addr(rrpc, rblk) + offset; + bpaddr = block_to_rel_addr(rrpc, rblk); + for (offset = 0; offset < dev->sec_per_blk; offset++) { + paddr = bpaddr + offset; pladdr = rrpc->rev_trans_map[paddr].addr; if (pladdr == ADDR_EMPTY) diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index dfca5c4d26bb..0577c4dae05f 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -156,6 +156,15 @@ struct rrpc_rev_addr { u64 addr; }; +static inline struct rrpc_block *rrpc_get_rblk(struct rrpc_lun *rlun, + int blk_id) +{ + struct rrpc *rrpc = rlun->rrpc; + int lun_blk = blk_id % rrpc->dev->blks_per_lun; + + return &rlun->blocks[lun_blk]; +} + static inline sector_t rrpc_get_laddr(struct bio *bio) { return bio->bi_iter.bi_sector / NR_PHY_IN_LOG; From 90beb2e7a0c5143a904be04c9c03afff436e7915 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Mar 2016 08:15:48 -0700 Subject: [PATCH 38/49] mtip32xx: remove unneeded variable in 
mtip_cmd_timeout() We always return BLK_EH_RESET_TIMER, so no point in storing that in an integer. Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1c330b61f05d..cc2e71d0a77f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3878,7 +3878,6 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, bool reserved) { struct driver_data *dd = req->q->queuedata; - int ret = BLK_EH_RESET_TIMER; if (reserved) goto exit_handler; @@ -3891,7 +3890,7 @@ static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, wake_up_interruptible(&dd->port->svc_wait); exit_handler: - return ret; + return BLK_EH_RESET_TIMER; } static struct blk_mq_ops mtip_mq_ops = { From 5e454c67fc594150e6c0da32b388a43d40200759 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 5 Mar 2016 00:49:31 +0100 Subject: [PATCH 39/49] nbd: use correct div_s64 helper The do_div() macro now checks its arguments for the correct type, and refuses anything other than u64, so we get a warning about nbd_ioctl passing in an loff_t: drivers/block/nbd.c: In function '__nbd_ioctl': drivers/block/nbd.c:757:77: error: comparison of distinct pointer types lacks a cast [-Werror] This changes the nbd code to use div_s64() instead, which takes a signed argument. 
Signed-off-by: Arnd Bergmann Fixes: 37091fdd831f ("nbd: Create size change events for userspace") Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f6b51d76e578..08afbc7a2bb8 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -753,8 +753,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, } case NBD_SET_BLKSIZE: { - loff_t bsize = nbd->bytesize; - do_div(bsize, arg); + loff_t bsize = div_s64(nbd->bytesize, arg); return nbd_size_set(nbd, bdev, arg, bsize); } From 08095e70783f1d8296f858d37a9e1878f5da0623 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 4 Mar 2016 13:15:17 -0700 Subject: [PATCH 40/49] NVMe: Create discard zero quirk white list The NVMe specification does not require discarded blocks return zeroes on read, but provides that behavior as a possibility. Some applications more efficiently use an SSD if reads on discarded blocks were deterministically zero, based on the "discard_zeroes_data" queue attribute. There is no specification defined way to determine device behavior on discarded blocks, so the driver always left the queue setting disabled. We can only know behavior based on individual device models, so this patch adds a flag to the NVMe "quirk" list that vendors may set if they know their controller works that way. The patch also sets the new flag for one such known device. Signed-off-by: Keith Busch Suggested-by: Artur Paszkiewicz Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 8 +++++++- drivers/nvme/host/nvme.h | 6 ++++++ drivers/nvme/host/pci.c | 3 ++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f08dccee8143..4304be00e556 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -582,8 +582,14 @@ static void nvme_init_integrity(struct nvme_ns *ns) static void nvme_config_discard(struct nvme_ns *ns) { + struct nvme_ctrl *ctrl = ns->ctrl; u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; + + if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES) + ns->queue->limits.discard_zeroes_data = 1; + else + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; ns->queue->limits.discard_granularity = logical_block_size; blk_queue_max_discard_sectors(ns->queue, 0xffffffff); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9b71fa8c75e4..a402a0ebf471 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -59,6 +59,12 @@ enum nvme_quirks { * correctly. */ NVME_QUIRK_IDENTIFY_CNS = (1 << 1), + + /* + * The controller deterministically returns O's on reads to discarded + * logical blocks. 
+ */ + NVME_QUIRK_DISCARD_ZEROES = (1 << 2), }; struct nvme_ctrl { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d47b08783110..74514c767429 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2130,7 +2130,8 @@ static const struct pci_error_handlers nvme_err_handler = { static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0x0953), - .driver_data = NVME_QUIRK_STRIPE_SIZE, }, + .driver_data = NVME_QUIRK_STRIPE_SIZE | + NVME_QUIRK_DISCARD_ZEROES, }, { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, From 07cc6ef8edc47f8b4fc1e276d31127a0a5863d4d Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Fri, 26 Feb 2016 14:39:06 -0800 Subject: [PATCH 41/49] bcache: fix race of writeback thread starting before complete initialization The bch_writeback_thread might BUG_ON in read_dirty() if dc->sb==BDEV_STATE_DIRTY and bch_sectors_dirty_init has not yet completed its related initialization. This patch downs the dc->writeback_lock until after initialization is complete, thus preventing bch_writeback_thread from proceeding prematurely. 
See this thread: http://thread.gmane.org/gmane.linux.kernel.bcache.devel/3453 Signed-off-by: Eric Wheeler Tested-by: Marc MERLIN Cc: Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8d0ead98eb6e..b411c73bfeb3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1015,8 +1015,12 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) */ atomic_set(&dc->count, 1); - if (bch_cached_dev_writeback_start(dc)) + /* Block writeback thread, but spawn it */ + down_write(&dc->writeback_lock); + if (bch_cached_dev_writeback_start(dc)) { + up_write(&dc->writeback_lock); return -ENOMEM; + } if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(dc); @@ -1028,6 +1032,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bch_cached_dev_run(dc); bcache_device_link(&dc->disk, c, "bdev"); + /* Allow the writeback thread to proceed */ + up_write(&dc->writeback_lock); + pr_info("Caching %s as %s on set %pU", bdevname(dc->bdev, buf), dc->disk.disk->disk_name, dc->disk.c->sb.set_uuid); From 9b299728ed777428b3908ac72ace5f8f84b97789 Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Fri, 26 Feb 2016 14:33:56 -0800 Subject: [PATCH 42/49] bcache: cleaned up error handling around register_cache() Fix null pointer dereference by changing register_cache() to return an int instead of being void. This allows it to return -ENOMEM or -ENODEV and enables upper layers to handle the OOM case without NULL pointer issues. 
See this thread: http://thread.gmane.org/gmane.linux.kernel.bcache.devel/3521 Fixes this error: gargamel:/sys/block/md5/bcache# echo /dev/sdh2 > /sys/fs/bcache/register bcache: register_cache() error opening sdh2: cannot allocate memory BUG: unable to handle kernel NULL pointer dereference at 00000000000009b8 IP: [] cache_set_flush+0x102/0x15c [bcache] PGD 120dff067 PUD 1119a3067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: veth ip6table_filter ip6_tables (...) CPU: 4 PID: 3371 Comm: kworker/4:3 Not tainted 4.4.2-amd64-i915-volpreempt-20160213bc1 #3 Hardware name: System manufacturer System Product Name/P8H67-M PRO, BIOS 3904 04/27/2013 Workqueue: events cache_set_flush [bcache] task: ffff88020d5dc280 ti: ffff88020b6f8000 task.ti: ffff88020b6f8000 RIP: 0010:[] [] cache_set_flush+0x102/0x15c [bcache] Signed-off-by: Eric Wheeler Tested-by: Marc MERLIN Cc: --- drivers/md/bcache/super.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b411c73bfeb3..6b07a0c8c729 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1835,11 +1835,12 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) return 0; } -static void register_cache(struct cache_sb *sb, struct page *sb_page, +static int register_cache(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; - const char *err = "cannot allocate memory"; + const char *err = NULL; + int ret = 0; memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; @@ -1854,27 +1855,35 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, if (blk_queue_discard(bdev_get_queue(ca->bdev))) ca->discard = CACHE_DISCARD(&ca->sb); - if (cache_alloc(sb, ca) != 0) + ret = cache_alloc(sb, ca); + if (ret != 0) goto err; - err = "error creating kobject"; - if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) - 
goto err; + if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) { + err = "error calling kobject_add"; + ret = -ENOMEM; + goto out; + } mutex_lock(&bch_register_lock); err = register_cache_set(ca); mutex_unlock(&bch_register_lock); - if (err) - goto err; + if (err) { + ret = -ENODEV; + goto out; + } pr_info("registered cache device %s", bdevname(bdev, name)); + out: kobject_put(&ca->kobj); - return; + err: - pr_notice("error opening %s: %s", bdevname(bdev, name), err); - goto out; + if (err) + pr_notice("error opening %s: %s", bdevname(bdev, name), err); + + return ret; } /* Global interfaces/init */ @@ -1972,7 +1981,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!ca) goto err_close; - register_cache(sb, sb_page, bdev, ca); + if (register_cache(sb, sb_page, bdev, ca) != 0) + goto err_close; } out: if (sb_page) From f8b11260a445169989d01df75d35af0f56178f95 Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Mon, 7 Mar 2016 15:17:50 -0800 Subject: [PATCH 43/49] bcache: fix cache_set_flush() NULL pointer dereference on OOM When bch_cache_set_alloc() fails to kzalloc the cache_set, the asynchronous closure handling tries to dereference a cache_set that hadn't yet been allocated inside of cache_set_flush() which is called by __cache_set_unregister() during cleanup. This appears to happen only during an OOM condition on bcache_register.
Signed-off-by: Eric Wheeler Cc: stable@vger.kernel.org --- drivers/md/bcache/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6b07a0c8c729..a296425a7270 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1373,6 +1373,9 @@ static void cache_set_flush(struct closure *cl) struct btree *b; unsigned i; + if (!c) + closure_return(cl); + bch_cache_accounting_destroy(&c->accounting); kobject_put(&c->internal); From 48c7823f42da2bc881ae2e325ed40123871c2fb9 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 8 Mar 2016 10:34:54 -0700 Subject: [PATCH 44/49] NVMe: Remove unused sq_head read in completion path Signed-off-by: Jon Derrick Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 74514c767429..e9f18e1d73e5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -134,7 +134,6 @@ struct nvme_queue { u32 __iomem *q_db; u16 q_depth; s16 cq_vector; - u16 sq_head; u16 sq_tail; u16 cq_head; u16 qid; @@ -719,7 +718,6 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) if ((status & 1) != phase) break; - nvmeq->sq_head = le16_to_cpu(cqe.sq_head); if (++head == nvmeq->q_depth) { head = 0; phase = !phase; From 516fdcea0db8f367bc814572cb8486de424cb0fc Mon Sep 17 00:00:00 2001 From: Don Brace Date: Tue, 8 Mar 2016 16:05:51 -0600 Subject: [PATCH 45/49] cciss: update MAINTAINERS Reviewed-by: Kevin Barnett Reviewed-by: Gerry Morong Signed-off-by: Don Brace Signed-off-by: Jens Axboe --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 30aca4aa5467..b2946df30e7f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5006,9 +5006,9 @@ F: include/linux/cciss*.h F: include/uapi/linux/cciss*.h HEWLETT-PACKARD SMART CISS RAID DRIVER (cciss) -M: Don Brace +M: Don Brace 
L: iss_storagedev@hp.com -L: storagedev@pmcs.com +L: esc.storagedev@microsemi.com L: linux-scsi@vger.kernel.org S: Supported F: Documentation/blockdev/cciss.txt From d436641439e0121d26b19d4268e9fb3ecd368d71 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Mar 2016 09:06:01 -0600 Subject: [PATCH 46/49] cpqarray: remove it from the kernel We disabled the ability to enable this driver back in October of 2013, we should be able to safely remove it at this point. The initial goal was to remove it in 3.15, so now is the time. Signed-off-by: Jens Axboe --- Documentation/blockdev/cpqarray.txt | 93 -- MAINTAINERS | 6 - drivers/block/Kconfig | 10 - drivers/block/cpqarray.c | 1820 --------------------------- drivers/block/cpqarray.h | 126 -- drivers/block/ida_cmd.h | 349 ----- drivers/block/ida_ioctl.h | 87 -- 7 files changed, 2491 deletions(-) delete mode 100644 Documentation/blockdev/cpqarray.txt delete mode 100644 drivers/block/cpqarray.c delete mode 100644 drivers/block/cpqarray.h delete mode 100644 drivers/block/ida_cmd.h delete mode 100644 drivers/block/ida_ioctl.h diff --git a/Documentation/blockdev/cpqarray.txt b/Documentation/blockdev/cpqarray.txt deleted file mode 100644 index c7154e20ef5e..000000000000 --- a/Documentation/blockdev/cpqarray.txt +++ /dev/null @@ -1,93 +0,0 @@ -This driver is for Compaq's SMART2 Intelligent Disk Array Controllers. - -Supported Cards: ----------------- - -This driver is known to work with the following cards: - - * SMART (EISA) - * SMART-2/E (EISA) - * SMART-2/P - * SMART-2DH - * SMART-2SL - * SMART-221 - * SMART-3100ES - * SMART-3200 - * Integrated Smart Array Controller - * SA 4200 - * SA 4250ES - * SA 431 - * RAID LC2 Controller - -It should also work with some really old Disk array adapters, but I am -unable to test against these cards: - - * IDA - * IDA-2 - * IAES - - -EISA Controllers: ------------------ - -If you want to use an EISA controller you'll have to supply some -modprobe/lilo parameters. 
If the driver is compiled into the kernel, must -give it the controller's IO port address at boot time (it is not -necessary to specify the IRQ). For example, if you had two SMART-2/E -controllers, in EISA slots 1 and 2 you'd give it a boot argument like -this: - - smart2=0x1000,0x2000 - -If you were loading the driver as a module, you'd give load it like this: - - modprobe cpqarray eisa=0x1000,0x2000 - -You can use EISA and PCI adapters at the same time. - - -Device Naming: --------------- - -You need some entries in /dev for the ida device. MAKEDEV in the /dev -directory can make device nodes for you automatically. The device setup is -as follows: - -Major numbers: - 72 ida0 - 73 ida1 - 74 ida2 - 75 ida3 - 76 ida4 - 77 ida5 - 78 ida6 - 79 ida7 - -Minor numbers: - b7 b6 b5 b4 b3 b2 b1 b0 - |----+----| |----+----| - | | - | +-------- Partition ID (0=wholedev, 1-15 partition) - | - +-------------------- Logical Volume number - -The device naming scheme is: -/dev/ida/c0d0 Controller 0, disk 0, whole device -/dev/ida/c0d0p1 Controller 0, disk 0, partition 1 -/dev/ida/c0d0p2 Controller 0, disk 0, partition 2 -/dev/ida/c0d0p3 Controller 0, disk 0, partition 3 - -/dev/ida/c1d1 Controller 1, disk 1, whole device -/dev/ida/c1d1p1 Controller 1, disk 1, partition 1 -/dev/ida/c1d1p2 Controller 1, disk 1, partition 2 -/dev/ida/c1d1p3 Controller 1, disk 1, partition 3 - - -Changelog: -========== - -10-28-2004 : General cleanup, syntax fixes for in-kernel driver version. 
- James Nelson - - -1999 : Original Document diff --git a/MAINTAINERS b/MAINTAINERS index b2946df30e7f..e1d64419f741 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4988,12 +4988,6 @@ T: git git://linuxtv.org/anttip/media_tree.git S: Maintained F: drivers/media/dvb-frontends/hd29l2* -HEWLETT-PACKARD SMART2 RAID DRIVER -L: iss_storagedev@hp.com -S: Orphan -F: Documentation/blockdev/cpqarray.txt -F: drivers/block/cpqarray.* - HEWLETT-PACKARD SMART ARRAY RAID DRIVER (hpsa) M: Don Brace L: iss_storagedev@hp.com diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 29819e719afa..39dd30b6ef86 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -110,16 +110,6 @@ source "drivers/block/mtip32xx/Kconfig" source "drivers/block/zram/Kconfig" -config BLK_CPQ_DA - tristate "Compaq SMART2 support" - depends on PCI && VIRT_TO_BUS && 0 - help - This is the driver for Compaq Smart Array controllers. Everyone - using these boards should say Y here. See the file - for the current list of - boards supported by this driver, and for further information on the - use of this driver. - config BLK_CPQ_CISS_DA tristate "Compaq Smart Array 5xxx support" depends on PCI diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c deleted file mode 100644 index f749df9e15cd..000000000000 --- a/drivers/block/cpqarray.c +++ /dev/null @@ -1,1820 +0,0 @@ -/* - * Disk Array driver for Compaq SMART2 Controllers - * Copyright 1998 Compaq Computer Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. 
See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Questions/Comments/Bugfixes to iss_storagedev@hp.com - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#define SMART2_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin)) - -#define DRIVER_NAME "Compaq SMART2 Driver (v 2.6.0)" -#define DRIVER_VERSION SMART2_DRIVER_VERSION(2,6,0) - -/* Embedded module documentation macros - see modules.h */ -/* Original author Chris Frantz - Compaq Computer Corporation */ -MODULE_AUTHOR("Compaq Computer Corporation"); -MODULE_DESCRIPTION("Driver for Compaq Smart2 Array Controllers version 2.6.0"); -MODULE_LICENSE("GPL"); - -#include "cpqarray.h" -#include "ida_cmd.h" -#include "smart1,2.h" -#include "ida_ioctl.h" - -#define READ_AHEAD 128 -#define NR_CMDS 128 /* This could probably go as high as ~400 */ - -#define MAX_CTLR 8 -#define CTLR_SHIFT 8 - -#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */ - -static DEFINE_MUTEX(cpqarray_mutex); -static int nr_ctlr; -static ctlr_info_t *hba[MAX_CTLR]; - -static int eisa[8]; - -#define NR_PRODUCTS ARRAY_SIZE(products) - -/* board_id = Subsystem Device ID & Vendor ID - * product = Marketing Name for the board - * access = Address of the struct of function pointers - */ -static struct board_type products[] = { - { 0x0040110E, "IDA", &smart1_access }, - { 0x0140110E, "IDA-2", &smart1_access }, - { 0x1040110E, "IAES", &smart1_access }, - { 0x2040110E, "SMART", &smart1_access }, - { 0x3040110E, "SMART-2/E", &smart2e_access }, - { 0x40300E11, "SMART-2/P", &smart2_access }, - { 0x40310E11, "SMART-2SL", &smart2_access }, - { 0x40320E11, 
"Smart Array 3200", &smart2_access }, - { 0x40330E11, "Smart Array 3100ES", &smart2_access }, - { 0x40340E11, "Smart Array 221", &smart2_access }, - { 0x40400E11, "Integrated Array", &smart4_access }, - { 0x40480E11, "Compaq Raid LC2", &smart4_access }, - { 0x40500E11, "Smart Array 4200", &smart4_access }, - { 0x40510E11, "Smart Array 4250ES", &smart4_access }, - { 0x40580E11, "Smart Array 431", &smart4_access }, -}; - -/* define the PCI info for the PCI cards this driver can control */ -static const struct pci_device_id cpqarray_pci_device_id[] = -{ - { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX, - 0x0E11, 0x4058, 0, 0, 0}, /* SA431 */ - { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX, - 0x0E11, 0x4051, 0, 0, 0}, /* SA4250ES */ - { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX, - 0x0E11, 0x4050, 0, 0, 0}, /* SA4200 */ - { PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C1510, - 0x0E11, 0x4048, 0, 0, 0}, /* LC2 */ - { PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C1510, - 0x0E11, 0x4040, 0, 0, 0}, /* Integrated Array */ - { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, - 0x0E11, 0x4034, 0, 0, 0}, /* SA 221 */ - { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, - 0x0E11, 0x4033, 0, 0, 0}, /* SA 3100ES*/ - { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, - 0x0E11, 0x4032, 0, 0, 0}, /* SA 3200*/ - { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, - 0x0E11, 0x4031, 0, 0, 0}, /* SA 2SL*/ - { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, - 0x0E11, 0x4030, 0, 0, 0}, /* SA 2P */ - { 0 } -}; - -MODULE_DEVICE_TABLE(pci, cpqarray_pci_device_id); - -static struct gendisk *ida_gendisk[MAX_CTLR][NWD]; - -/* Debug... */ -#define DBG(s) do { s } while(0) -/* Debug (general info)... */ -#define DBGINFO(s) do { } while(0) -/* Debug Paranoid... */ -#define DBGP(s) do { } while(0) -/* Debug Extra Paranoid... 
*/ -#define DBGPX(s) do { } while(0) - -static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev); -static void __iomem *remap_pci_mem(ulong base, ulong size); -static int cpqarray_eisa_detect(void); -static int pollcomplete(int ctlr); -static void getgeometry(int ctlr); -static void start_fwbk(int ctlr); - -static cmdlist_t * cmd_alloc(ctlr_info_t *h, int get_from_pool); -static void cmd_free(ctlr_info_t *h, cmdlist_t *c, int got_from_pool); - -static void free_hba(int i); -static int alloc_cpqarray_hba(void); - -static int sendcmd( - __u8 cmd, - int ctlr, - void *buff, - size_t size, - unsigned int blk, - unsigned int blkcnt, - unsigned int log_unit ); - -static int ida_unlocked_open(struct block_device *bdev, fmode_t mode); -static void ida_release(struct gendisk *disk, fmode_t mode); -static int ida_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); -static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo); -static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io); - -static void do_ida_request(struct request_queue *q); -static void start_io(ctlr_info_t *h); - -static inline void addQ(cmdlist_t **Qptr, cmdlist_t *c); -static inline cmdlist_t *removeQ(cmdlist_t **Qptr, cmdlist_t *c); -static inline void complete_command(cmdlist_t *cmd, int timeout); - -static irqreturn_t do_ida_intr(int irq, void *dev_id); -static void ida_timer(unsigned long tdata); -static int ida_revalidate(struct gendisk *disk); -static int revalidate_allvol(ctlr_info_t *host); -static int cpqarray_register_ctlr(int ctlr, struct pci_dev *pdev); - -#ifdef CONFIG_PROC_FS -static void ida_procinit(int i); -#else -static void ida_procinit(int i) {} -#endif - -static inline drv_info_t *get_drv(struct gendisk *disk) -{ - return disk->private_data; -} - -static inline ctlr_info_t *get_host(struct gendisk *disk) -{ - return disk->queue->queuedata; -} - - -static const struct block_device_operations ida_fops = { - .owner = 
THIS_MODULE, - .open = ida_unlocked_open, - .release = ida_release, - .ioctl = ida_ioctl, - .getgeo = ida_getgeo, - .revalidate_disk= ida_revalidate, -}; - - -#ifdef CONFIG_PROC_FS - -static struct proc_dir_entry *proc_array; -static const struct file_operations ida_proc_fops; - -/* - * Get us a file in /proc/array that says something about each controller. - * Create /proc/array if it doesn't exist yet. - */ -static void __init ida_procinit(int i) -{ - if (proc_array == NULL) { - proc_array = proc_mkdir("driver/cpqarray", NULL); - if (!proc_array) return; - } - - proc_create_data(hba[i]->devname, 0, proc_array, &ida_proc_fops, hba[i]); -} - -/* - * Report information about this controller. - */ -static int ida_proc_show(struct seq_file *m, void *v) -{ - int i, ctlr; - ctlr_info_t *h = (ctlr_info_t*)m->private; - drv_info_t *drv; -#ifdef CPQ_PROC_PRINT_QUEUES - cmdlist_t *c; - unsigned long flags; -#endif - - ctlr = h->ctlr; - seq_printf(m, "%s: Compaq %s Controller\n" - " Board ID: 0x%08lx\n" - " Firmware Revision: %c%c%c%c\n" - " Controller Sig: 0x%08lx\n" - " Memory Address: 0x%08lx\n" - " I/O Port: 0x%04x\n" - " IRQ: %d\n" - " Logical drives: %d\n" - " Physical drives: %d\n\n" - " Current Q depth: %d\n" - " Max Q depth since init: %d\n\n", - h->devname, - h->product_name, - (unsigned long)h->board_id, - h->firm_rev[0], h->firm_rev[1], h->firm_rev[2], h->firm_rev[3], - (unsigned long)h->ctlr_sig, (unsigned long)h->vaddr, - (unsigned int) h->io_mem_addr, (unsigned int)h->intr, - h->log_drives, h->phys_drives, - h->Qdepth, h->maxQsinceinit); - - seq_puts(m, "Logical Drive Info:\n"); - - for(i=0; ilog_drives; i++) { - drv = &h->drv[i]; - seq_printf(m, "ida/c%dd%d: blksz=%d nr_blks=%d\n", - ctlr, i, drv->blk_size, drv->nr_blks); - } - -#ifdef CPQ_PROC_PRINT_QUEUES - spin_lock_irqsave(IDA_LOCK(h->ctlr), flags); - seq_puts(m, "\nCurrent Queues:\n"); - - c = h->reqQ; - seq_printf(m, "reqQ = %p", c); - if (c) c=c->next; - while(c && c != h->reqQ) { - seq_printf(m, 
"->%p", c); - c=c->next; - } - - c = h->cmpQ; - seq_printf(m, "\ncmpQ = %p", c); - if (c) c=c->next; - while(c && c != h->cmpQ) { - seq_printf(m, "->%p", c); - c=c->next; - } - - seq_putc(m, '\n'); - spin_unlock_irqrestore(IDA_LOCK(h->ctlr), flags); -#endif - seq_printf(m, "nr_allocs = %d\nnr_frees = %d\n", - h->nr_allocs, h->nr_frees); - return 0; -} - -static int ida_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, ida_proc_show, PDE_DATA(inode)); -} - -static const struct file_operations ida_proc_fops = { - .owner = THIS_MODULE, - .open = ida_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_PROC_FS */ - -module_param_array(eisa, int, NULL, 0); - -static void release_io_mem(ctlr_info_t *c) -{ - /* if IO mem was not protected do nothing */ - if( c->io_mem_addr == 0) - return; - release_region(c->io_mem_addr, c->io_mem_length); - c->io_mem_addr = 0; - c->io_mem_length = 0; -} - -static void cpqarray_remove_one(int i) -{ - int j; - char buff[4]; - - /* sendcmd will turn off interrupt, and send the flush... 
- * To write all data in the battery backed cache to disks - * no data returned, but don't want to send NULL to sendcmd */ - if( sendcmd(FLUSH_CACHE, i, buff, 4, 0, 0, 0)) - { - printk(KERN_WARNING "Unable to flush cache on controller %d\n", - i); - } - free_irq(hba[i]->intr, hba[i]); - iounmap(hba[i]->vaddr); - unregister_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname); - del_timer(&hba[i]->timer); - remove_proc_entry(hba[i]->devname, proc_array); - pci_free_consistent(hba[i]->pci_dev, - NR_CMDS * sizeof(cmdlist_t), (hba[i]->cmd_pool), - hba[i]->cmd_pool_dhandle); - kfree(hba[i]->cmd_pool_bits); - for(j = 0; j < NWD; j++) { - if (ida_gendisk[i][j]->flags & GENHD_FL_UP) - del_gendisk(ida_gendisk[i][j]); - put_disk(ida_gendisk[i][j]); - } - blk_cleanup_queue(hba[i]->queue); - release_io_mem(hba[i]); - free_hba(i); -} - -static void cpqarray_remove_one_pci(struct pci_dev *pdev) -{ - int i; - ctlr_info_t *tmp_ptr; - - if (pci_get_drvdata(pdev) == NULL) { - printk( KERN_ERR "cpqarray: Unable to remove device \n"); - return; - } - - tmp_ptr = pci_get_drvdata(pdev); - i = tmp_ptr->ctlr; - if (hba[i] == NULL) { - printk(KERN_ERR "cpqarray: controller %d appears to have" - "already been removed \n", i); - return; - } - pci_set_drvdata(pdev, NULL); - - cpqarray_remove_one(i); -} - -/* removing an instance that was not removed automatically.. - * must be an eisa card. 
- */ -static void cpqarray_remove_one_eisa(int i) -{ - if (hba[i] == NULL) { - printk(KERN_ERR "cpqarray: controller %d appears to have" - "already been removed \n", i); - return; - } - cpqarray_remove_one(i); -} - -/* pdev is NULL for eisa */ -static int cpqarray_register_ctlr(int i, struct pci_dev *pdev) -{ - struct request_queue *q; - int j; - - /* - * register block devices - * Find disks and fill in structs - * Get an interrupt, set the Q depth and get into /proc - */ - - /* If this successful it should insure that we are the only */ - /* instance of the driver */ - if (register_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname)) { - goto Enomem4; - } - hba[i]->access.set_intr_mask(hba[i], 0); - if (request_irq(hba[i]->intr, do_ida_intr, IRQF_SHARED, - hba[i]->devname, hba[i])) - { - printk(KERN_ERR "cpqarray: Unable to get irq %d for %s\n", - hba[i]->intr, hba[i]->devname); - goto Enomem3; - } - - for (j=0; jcmd_pool = pci_alloc_consistent( - hba[i]->pci_dev, NR_CMDS * sizeof(cmdlist_t), - &(hba[i]->cmd_pool_dhandle)); - hba[i]->cmd_pool_bits = kcalloc( - DIV_ROUND_UP(NR_CMDS, BITS_PER_LONG), sizeof(unsigned long), - GFP_KERNEL); - - if (!hba[i]->cmd_pool_bits || !hba[i]->cmd_pool) - goto Enomem1; - - memset(hba[i]->cmd_pool, 0, NR_CMDS * sizeof(cmdlist_t)); - printk(KERN_INFO "cpqarray: Finding drives on %s", - hba[i]->devname); - - spin_lock_init(&hba[i]->lock); - q = blk_init_queue(do_ida_request, &hba[i]->lock); - if (!q) - goto Enomem1; - - hba[i]->queue = q; - q->queuedata = hba[i]; - - getgeometry(i); - start_fwbk(i); - - ida_procinit(i); - - if (pdev) - blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask); - - /* This is a hardware imposed limit. 
*/ - blk_queue_max_segments(q, SG_MAX); - - init_timer(&hba[i]->timer); - hba[i]->timer.expires = jiffies + IDA_TIMER; - hba[i]->timer.data = (unsigned long)hba[i]; - hba[i]->timer.function = ida_timer; - add_timer(&hba[i]->timer); - - /* Enable IRQ now that spinlock and rate limit timer are set up */ - hba[i]->access.set_intr_mask(hba[i], FIFO_NOT_EMPTY); - - for(j=0; jdrv[j]; - sprintf(disk->disk_name, "ida/c%dd%d", i, j); - disk->major = COMPAQ_SMART2_MAJOR + i; - disk->first_minor = j<fops = &ida_fops; - if (j && !drv->nr_blks) - continue; - blk_queue_logical_block_size(hba[i]->queue, drv->blk_size); - set_capacity(disk, drv->nr_blks); - disk->queue = hba[i]->queue; - disk->private_data = drv; - add_disk(disk); - } - - /* done ! */ - return(i); - -Enomem1: - nr_ctlr = i; - kfree(hba[i]->cmd_pool_bits); - if (hba[i]->cmd_pool) - pci_free_consistent(hba[i]->pci_dev, NR_CMDS*sizeof(cmdlist_t), - hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle); -Enomem2: - while (j--) { - put_disk(ida_gendisk[i][j]); - ida_gendisk[i][j] = NULL; - } - free_irq(hba[i]->intr, hba[i]); -Enomem3: - unregister_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname); -Enomem4: - if (pdev) - pci_set_drvdata(pdev, NULL); - release_io_mem(hba[i]); - free_hba(i); - - printk( KERN_ERR "cpqarray: out of memory"); - - return -1; -} - -static int cpqarray_init_one(struct pci_dev *pdev, - const struct pci_device_id *ent) -{ - int i; - - printk(KERN_DEBUG "cpqarray: Device 0x%x has been found at" - " bus %d dev %d func %d\n", - pdev->device, pdev->bus->number, PCI_SLOT(pdev->devfn), - PCI_FUNC(pdev->devfn)); - i = alloc_cpqarray_hba(); - if( i < 0 ) - return (-1); - memset(hba[i], 0, sizeof(ctlr_info_t)); - sprintf(hba[i]->devname, "ida%d", i); - hba[i]->ctlr = i; - /* Initialize the pdev driver private data */ - pci_set_drvdata(pdev, hba[i]); - - if (cpqarray_pci_init(hba[i], pdev) != 0) { - pci_set_drvdata(pdev, NULL); - release_io_mem(hba[i]); - free_hba(i); - return -1; - } - - return 
(cpqarray_register_ctlr(i, pdev)); -} - -static struct pci_driver cpqarray_pci_driver = { - .name = "cpqarray", - .probe = cpqarray_init_one, - .remove = cpqarray_remove_one_pci, - .id_table = cpqarray_pci_device_id, -}; - -/* - * This is it. Find all the controllers and register them. - * returns the number of block devices registered. - */ -static int __init cpqarray_init(void) -{ - int num_cntlrs_reg = 0; - int i; - int rc = 0; - - /* detect controllers */ - printk(DRIVER_NAME "\n"); - - rc = pci_register_driver(&cpqarray_pci_driver); - if (rc) - return rc; - cpqarray_eisa_detect(); - - for (i=0; i < MAX_CTLR; i++) { - if (hba[i] != NULL) - num_cntlrs_reg++; - } - - if (num_cntlrs_reg) - return 0; - else { - pci_unregister_driver(&cpqarray_pci_driver); - return -ENODEV; - } -} - -/* Function to find the first free pointer into our hba[] array */ -/* Returns -1 if no free entries are left. */ -static int alloc_cpqarray_hba(void) -{ - int i; - - for(i=0; i< MAX_CTLR; i++) { - if (hba[i] == NULL) { - hba[i] = kmalloc(sizeof(ctlr_info_t), GFP_KERNEL); - if(hba[i]==NULL) { - printk(KERN_ERR "cpqarray: out of memory.\n"); - return (-1); - } - return (i); - } - } - printk(KERN_WARNING "cpqarray: This driver supports a maximum" - " of 8 controllers.\n"); - return(-1); -} - -static void free_hba(int i) -{ - kfree(hba[i]); - hba[i]=NULL; -} - -/* - * Find the IO address of the controller, its IRQ and so forth. Fill - * in some basic stuff into the ctlr_info_t structure. 
- */ -static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev) -{ - ushort vendor_id, device_id, command; - unchar cache_line_size, latency_timer; - unchar irq, revision; - unsigned long addr[6]; - __u32 board_id; - - int i; - - c->pci_dev = pdev; - pci_set_master(pdev); - if (pci_enable_device(pdev)) { - printk(KERN_ERR "cpqarray: Unable to Enable PCI device\n"); - return -1; - } - vendor_id = pdev->vendor; - device_id = pdev->device; - revision = pdev->revision; - irq = pdev->irq; - - for(i=0; i<6; i++) - addr[i] = pci_resource_start(pdev, i); - - if (pci_set_dma_mask(pdev, CPQARRAY_DMA_MASK) != 0) - { - printk(KERN_ERR "cpqarray: Unable to set DMA mask\n"); - return -1; - } - - pci_read_config_word(pdev, PCI_COMMAND, &command); - pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size); - pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer); - - pci_read_config_dword(pdev, 0x2c, &board_id); - - /* check to see if controller has been disabled */ - if(!(command & 0x02)) { - printk(KERN_WARNING - "cpqarray: controller appears to be disabled\n"); - return(-1); - } - -DBGINFO( - printk("vendor_id = %x\n", vendor_id); - printk("device_id = %x\n", device_id); - printk("command = %x\n", command); - for(i=0; i<6; i++) - printk("addr[%d] = %lx\n", i, addr[i]); - printk("revision = %x\n", revision); - printk("irq = %x\n", irq); - printk("cache_line_size = %x\n", cache_line_size); - printk("latency_timer = %x\n", latency_timer); - printk("board_id = %x\n", board_id); -); - - c->intr = irq; - - for(i=0; i<6; i++) { - if (pci_resource_flags(pdev, i) & PCI_BASE_ADDRESS_SPACE_IO) - { /* IO space */ - c->io_mem_addr = addr[i]; - c->io_mem_length = pci_resource_end(pdev, i) - - pci_resource_start(pdev, i) + 1; - if(!request_region( c->io_mem_addr, c->io_mem_length, - "cpqarray")) - { - printk( KERN_WARNING "cpqarray I/O memory range already in use addr %lx length = %ld\n", c->io_mem_addr, c->io_mem_length); - c->io_mem_addr = 0; - c->io_mem_length = 0; 
- } - break; - } - } - - c->paddr = 0; - for(i=0; i<6; i++) - if (!(pci_resource_flags(pdev, i) & - PCI_BASE_ADDRESS_SPACE_IO)) { - c->paddr = pci_resource_start (pdev, i); - break; - } - if (!c->paddr) - return -1; - c->vaddr = remap_pci_mem(c->paddr, 128); - if (!c->vaddr) - return -1; - c->board_id = board_id; - - for(i=0; iproduct_name = products[i].product_name; - c->access = *(products[i].access); - break; - } - } - if (i == NR_PRODUCTS) { - printk(KERN_WARNING "cpqarray: Sorry, I don't know how" - " to access the SMART Array controller %08lx\n", - (unsigned long)board_id); - return -1; - } - - return 0; -} - -/* - * Map (physical) PCI mem into (virtual) kernel space - */ -static void __iomem *remap_pci_mem(ulong base, ulong size) -{ - ulong page_base = ((ulong) base) & PAGE_MASK; - ulong page_offs = ((ulong) base) - page_base; - void __iomem *page_remapped = ioremap(page_base, page_offs+size); - - return (page_remapped ? (page_remapped + page_offs) : NULL); -} - -#ifndef MODULE -/* - * Config string is a comma separated set of i/o addresses of EISA cards. 
- */ -static int cpqarray_setup(char *str) -{ - int i, ints[9]; - - (void)get_options(str, ARRAY_SIZE(ints), ints); - - for(i=0; iio_mem_addr = eisa[i]; - hba[ctlr]->io_mem_length = 0x7FF; - if(!request_region(hba[ctlr]->io_mem_addr, - hba[ctlr]->io_mem_length, - "cpqarray")) - { - printk(KERN_WARNING "cpqarray: I/O range already in " - "use addr = %lx length = %ld\n", - hba[ctlr]->io_mem_addr, - hba[ctlr]->io_mem_length); - free_hba(ctlr); - continue; - } - - /* - * Read the config register to find our interrupt - */ - intr = inb(eisa[i]+0xCC0) >> 4; - if (intr & 1) intr = 11; - else if (intr & 2) intr = 10; - else if (intr & 4) intr = 14; - else if (intr & 8) intr = 15; - - hba[ctlr]->intr = intr; - sprintf(hba[ctlr]->devname, "ida%d", nr_ctlr); - hba[ctlr]->product_name = products[j].product_name; - hba[ctlr]->access = *(products[j].access); - hba[ctlr]->ctlr = ctlr; - hba[ctlr]->board_id = board_id; - hba[ctlr]->pci_dev = NULL; /* not PCI */ - -DBGINFO( - printk("i = %d, j = %d\n", i, j); - printk("irq = %x\n", intr); - printk("product name = %s\n", products[j].product_name); - printk("board_id = %x\n", board_id); -); - - num_ctlr++; - i++; - - if (cpqarray_register_ctlr(ctlr, NULL) == -1) - printk(KERN_WARNING - "cpqarray: Can't register EISA controller %d\n", - ctlr); - - } - - return num_ctlr; -} - -/* - * Open. Make sure the device is really there. - */ -static int ida_open(struct block_device *bdev, fmode_t mode) -{ - drv_info_t *drv = get_drv(bdev->bd_disk); - ctlr_info_t *host = get_host(bdev->bd_disk); - - DBGINFO(printk("ida_open %s\n", bdev->bd_disk->disk_name)); - /* - * Root is allowed to open raw volume zero even if it's not configured - * so array config can still work. I don't think I really like this, - * but I'm already using way to many device nodes to claim another one - * for "raw controller". 
- */ - if (!drv->nr_blks) { - if (!capable(CAP_SYS_RAWIO)) - return -ENXIO; - if (!capable(CAP_SYS_ADMIN) && drv != host->drv) - return -ENXIO; - } - host->usage_count++; - return 0; -} - -static int ida_unlocked_open(struct block_device *bdev, fmode_t mode) -{ - int ret; - - mutex_lock(&cpqarray_mutex); - ret = ida_open(bdev, mode); - mutex_unlock(&cpqarray_mutex); - - return ret; -} - -/* - * Close. Sync first. - */ -static void ida_release(struct gendisk *disk, fmode_t mode) -{ - ctlr_info_t *host; - - mutex_lock(&cpqarray_mutex); - host = get_host(disk); - host->usage_count--; - mutex_unlock(&cpqarray_mutex); -} - -/* - * Enqueuing and dequeuing functions for cmdlists. - */ -static inline void addQ(cmdlist_t **Qptr, cmdlist_t *c) -{ - if (*Qptr == NULL) { - *Qptr = c; - c->next = c->prev = c; - } else { - c->prev = (*Qptr)->prev; - c->next = (*Qptr); - (*Qptr)->prev->next = c; - (*Qptr)->prev = c; - } -} - -static inline cmdlist_t *removeQ(cmdlist_t **Qptr, cmdlist_t *c) -{ - if (c && c->next != c) { - if (*Qptr == c) *Qptr = c->next; - c->prev->next = c->next; - c->next->prev = c->prev; - } else { - *Qptr = NULL; - } - return c; -} - -/* - * Get a request and submit it to the controller. - * This routine needs to grab all the requests it possibly can from the - * req Q and submit them. 
Interrupts are off (and need to be off) when you - * are in here (either via the dummy do_ida_request functions or by being - * called from the interrupt handler - */ -static void do_ida_request(struct request_queue *q) -{ - ctlr_info_t *h = q->queuedata; - cmdlist_t *c; - struct request *creq; - struct scatterlist tmp_sg[SG_MAX]; - int i, dir, seg; - -queue_next: - creq = blk_peek_request(q); - if (!creq) - goto startio; - - BUG_ON(creq->nr_phys_segments > SG_MAX); - - if ((c = cmd_alloc(h,1)) == NULL) - goto startio; - - blk_start_request(creq); - - c->ctlr = h->ctlr; - c->hdr.unit = (drv_info_t *)(creq->rq_disk->private_data) - h->drv; - c->hdr.size = sizeof(rblk_t) >> 2; - c->size += sizeof(rblk_t); - - c->req.hdr.blk = blk_rq_pos(creq); - c->rq = creq; -DBGPX( - printk("sector=%d, nr_sectors=%u\n", - blk_rq_pos(creq), blk_rq_sectors(creq)); -); - sg_init_table(tmp_sg, SG_MAX); - seg = blk_rq_map_sg(q, creq, tmp_sg); - - /* Now do all the DMA Mappings */ - if (rq_data_dir(creq) == READ) - dir = PCI_DMA_FROMDEVICE; - else - dir = PCI_DMA_TODEVICE; - for( i=0; i < seg; i++) - { - c->req.sg[i].size = tmp_sg[i].length; - c->req.sg[i].addr = (__u32) pci_map_page(h->pci_dev, - sg_page(&tmp_sg[i]), - tmp_sg[i].offset, - tmp_sg[i].length, dir); - } -DBGPX( printk("Submitting %u sectors in %d segments\n", blk_rq_sectors(creq), seg); ); - c->req.hdr.sg_cnt = seg; - c->req.hdr.blk_cnt = blk_rq_sectors(creq); - c->req.hdr.cmd = (rq_data_dir(creq) == READ) ? IDA_READ : IDA_WRITE; - c->type = CMD_RWREQ; - - /* Put the request on the tail of the request queue */ - addQ(&h->reqQ, c); - h->Qdepth++; - if (h->Qdepth > h->maxQsinceinit) - h->maxQsinceinit = h->Qdepth; - - goto queue_next; - -startio: - start_io(h); -} - -/* - * start_io submits everything on a controller's request queue - * and moves it to the completion queue. 
- * - * Interrupts had better be off if you're in here - */ -static void start_io(ctlr_info_t *h) -{ - cmdlist_t *c; - - while((c = h->reqQ) != NULL) { - /* Can't do anything if we're busy */ - if (h->access.fifo_full(h) == 0) - return; - - /* Get the first entry from the request Q */ - removeQ(&h->reqQ, c); - h->Qdepth--; - - /* Tell the controller to do our bidding */ - h->access.submit_command(h, c); - - /* Get onto the completion Q */ - addQ(&h->cmpQ, c); - } -} - -/* - * Mark all buffers that cmd was responsible for - */ -static inline void complete_command(cmdlist_t *cmd, int timeout) -{ - struct request *rq = cmd->rq; - int error = 0; - int i, ddir; - - if (cmd->req.hdr.rcode & RCODE_NONFATAL && - (hba[cmd->ctlr]->misc_tflags & MISC_NONFATAL_WARN) == 0) { - printk(KERN_NOTICE "Non Fatal error on ida/c%dd%d\n", - cmd->ctlr, cmd->hdr.unit); - hba[cmd->ctlr]->misc_tflags |= MISC_NONFATAL_WARN; - } - if (cmd->req.hdr.rcode & RCODE_FATAL) { - printk(KERN_WARNING "Fatal error on ida/c%dd%d\n", - cmd->ctlr, cmd->hdr.unit); - error = -EIO; - } - if (cmd->req.hdr.rcode & RCODE_INVREQ) { - printk(KERN_WARNING "Invalid request on ida/c%dd%d = (cmd=%x sect=%d cnt=%d sg=%d ret=%x)\n", - cmd->ctlr, cmd->hdr.unit, cmd->req.hdr.cmd, - cmd->req.hdr.blk, cmd->req.hdr.blk_cnt, - cmd->req.hdr.sg_cnt, cmd->req.hdr.rcode); - error = -EIO; - } - if (timeout) - error = -EIO; - /* unmap the DMA mapping for all the scatter gather elements */ - if (cmd->req.hdr.cmd == IDA_READ) - ddir = PCI_DMA_FROMDEVICE; - else - ddir = PCI_DMA_TODEVICE; - for(i=0; ireq.hdr.sg_cnt; i++) - pci_unmap_page(hba[cmd->ctlr]->pci_dev, cmd->req.sg[i].addr, - cmd->req.sg[i].size, ddir); - - DBGPX(printk("Done with %p\n", rq);); - __blk_end_request_all(rq, error); -} - -/* - * The controller will interrupt us upon completion of commands. 
- * Find the command on the completion queue, remove it, tell the OS and - * try to queue up more IO - */ -static irqreturn_t do_ida_intr(int irq, void *dev_id) -{ - ctlr_info_t *h = dev_id; - cmdlist_t *c; - unsigned long istat; - unsigned long flags; - __u32 a,a1; - - istat = h->access.intr_pending(h); - /* Is this interrupt for us? */ - if (istat == 0) - return IRQ_NONE; - - /* - * If there are completed commands in the completion queue, - * we had better do something about it. - */ - spin_lock_irqsave(IDA_LOCK(h->ctlr), flags); - if (istat & FIFO_NOT_EMPTY) { - while((a = h->access.command_completed(h))) { - a1 = a; a &= ~3; - if ((c = h->cmpQ) == NULL) - { - printk(KERN_WARNING "cpqarray: Completion of %08lx ignored\n", (unsigned long)a1); - continue; - } - while(c->busaddr != a) { - c = c->next; - if (c == h->cmpQ) - break; - } - /* - * If we've found the command, take it off the - * completion Q and free it - */ - if (c->busaddr == a) { - removeQ(&h->cmpQ, c); - /* Check for invalid command. - * Controller returns command error, - * But rcode = 0. - */ - - if((a1 & 0x03) && (c->req.hdr.rcode == 0)) - { - c->req.hdr.rcode = RCODE_INVREQ; - } - if (c->type == CMD_RWREQ) { - complete_command(c, 0); - cmd_free(h, c, 1); - } else if (c->type == CMD_IOCTL_PEND) { - c->type = CMD_IOCTL_DONE; - } - continue; - } - } - } - - /* - * See if we can queue up some more IO - */ - do_ida_request(h->queue); - spin_unlock_irqrestore(IDA_LOCK(h->ctlr), flags); - return IRQ_HANDLED; -} - -/* - * This timer was for timing out requests that haven't happened after - * IDA_TIMEOUT. That wasn't such a good idea. This timer is used to - * reset a flags structure so we don't flood the user with - * "Non-Fatal error" messages. 
- */ -static void ida_timer(unsigned long tdata) -{ - ctlr_info_t *h = (ctlr_info_t*)tdata; - - h->timer.expires = jiffies + IDA_TIMER; - add_timer(&h->timer); - h->misc_tflags = 0; -} - -static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - drv_info_t *drv = get_drv(bdev->bd_disk); - - if (drv->cylinders) { - geo->heads = drv->heads; - geo->sectors = drv->sectors; - geo->cylinders = drv->cylinders; - } else { - geo->heads = 0xff; - geo->sectors = 0x3f; - geo->cylinders = drv->nr_blks / (0xff*0x3f); - } - - return 0; -} - -/* - * ida_ioctl does some miscellaneous stuff like reporting drive geometry, - * setting readahead and submitting commands from userspace to the controller. - */ -static int ida_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) -{ - drv_info_t *drv = get_drv(bdev->bd_disk); - ctlr_info_t *host = get_host(bdev->bd_disk); - int error; - ida_ioctl_t __user *io = (ida_ioctl_t __user *)arg; - ida_ioctl_t *my_io; - - switch(cmd) { - case IDAGETDRVINFO: - if (copy_to_user(&io->c.drv, drv, sizeof(drv_info_t))) - return -EFAULT; - return 0; - case IDAPASSTHRU: - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - my_io = kmalloc(sizeof(ida_ioctl_t), GFP_KERNEL); - if (!my_io) - return -ENOMEM; - error = -EFAULT; - if (copy_from_user(my_io, io, sizeof(*my_io))) - goto out_passthru; - error = ida_ctlr_ioctl(host, drv - host->drv, my_io); - if (error) - goto out_passthru; - error = -EFAULT; - if (copy_to_user(io, my_io, sizeof(*my_io))) - goto out_passthru; - error = 0; -out_passthru: - kfree(my_io); - return error; - case IDAGETCTLRSIG: - if (!arg) return -EINVAL; - if (put_user(host->ctlr_sig, (int __user *)arg)) - return -EFAULT; - return 0; - case IDAREVALIDATEVOLS: - if (MINOR(bdev->bd_dev) != 0) - return -ENXIO; - return revalidate_allvol(host); - case IDADRIVERVERSION: - if (!arg) return -EINVAL; - if (put_user(DRIVER_VERSION, (unsigned long __user *)arg)) - return -EFAULT; - return 0; - 
case IDAGETPCIINFO: - { - - ida_pci_info_struct pciinfo; - - if (!arg) return -EINVAL; - memset(&pciinfo, 0, sizeof(pciinfo)); - pciinfo.bus = host->pci_dev->bus->number; - pciinfo.dev_fn = host->pci_dev->devfn; - pciinfo.board_id = host->board_id; - if(copy_to_user((void __user *) arg, &pciinfo, - sizeof( ida_pci_info_struct))) - return -EFAULT; - return(0); - } - - default: - return -EINVAL; - } - -} - -static int ida_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long param) -{ - int ret; - - mutex_lock(&cpqarray_mutex); - ret = ida_locked_ioctl(bdev, mode, cmd, param); - mutex_unlock(&cpqarray_mutex); - - return ret; -} - -/* - * ida_ctlr_ioctl is for passing commands to the controller from userspace. - * The command block (io) has already been copied to kernel space for us, - * however, any elements in the sglist need to be copied to kernel space - * or copied back to userspace. - * - * Only root may perform a controller passthru command, however I'm not doing - * any serious sanity checking on the arguments. Doing an IDA_WRITE_MEDIA and - * putting a 64M buffer in the sglist is probably a *bad* idea. - */ -static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io) -{ - int ctlr = h->ctlr; - cmdlist_t *c; - void *p = NULL; - unsigned long flags; - int error; - - if ((c = cmd_alloc(h, 0)) == NULL) - return -ENOMEM; - c->ctlr = ctlr; - c->hdr.unit = (io->unit & UNITVALID) ? 
(io->unit & ~UNITVALID) : dsk; - c->hdr.size = sizeof(rblk_t) >> 2; - c->size += sizeof(rblk_t); - - c->req.hdr.cmd = io->cmd; - c->req.hdr.blk = io->blk; - c->req.hdr.blk_cnt = io->blk_cnt; - c->type = CMD_IOCTL_PEND; - - /* Pre submit processing */ - switch(io->cmd) { - case PASSTHRU_A: - p = memdup_user(io->sg[0].addr, io->sg[0].size); - if (IS_ERR(p)) { - error = PTR_ERR(p); - cmd_free(h, c, 0); - return error; - } - c->req.hdr.blk = pci_map_single(h->pci_dev, &(io->c), - sizeof(ida_ioctl_t), - PCI_DMA_BIDIRECTIONAL); - c->req.sg[0].size = io->sg[0].size; - c->req.sg[0].addr = pci_map_single(h->pci_dev, p, - c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); - c->req.hdr.sg_cnt = 1; - break; - case IDA_READ: - case READ_FLASH_ROM: - case SENSE_CONTROLLER_PERFORMANCE: - p = kmalloc(io->sg[0].size, GFP_KERNEL); - if (!p) - { - error = -ENOMEM; - cmd_free(h, c, 0); - return(error); - } - - c->req.sg[0].size = io->sg[0].size; - c->req.sg[0].addr = pci_map_single(h->pci_dev, p, - c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); - c->req.hdr.sg_cnt = 1; - break; - case IDA_WRITE: - case IDA_WRITE_MEDIA: - case DIAG_PASS_THRU: - case COLLECT_BUFFER: - case WRITE_FLASH_ROM: - p = memdup_user(io->sg[0].addr, io->sg[0].size); - if (IS_ERR(p)) { - error = PTR_ERR(p); - cmd_free(h, c, 0); - return error; - } - c->req.sg[0].size = io->sg[0].size; - c->req.sg[0].addr = pci_map_single(h->pci_dev, p, - c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); - c->req.hdr.sg_cnt = 1; - break; - default: - c->req.sg[0].size = sizeof(io->c); - c->req.sg[0].addr = pci_map_single(h->pci_dev,&io->c, - c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); - c->req.hdr.sg_cnt = 1; - } - - /* Put the request on the tail of the request queue */ - spin_lock_irqsave(IDA_LOCK(ctlr), flags); - addQ(&h->reqQ, c); - h->Qdepth++; - start_io(h); - spin_unlock_irqrestore(IDA_LOCK(ctlr), flags); - - /* Wait for completion */ - while(c->type != CMD_IOCTL_DONE) - schedule(); - - /* Unmap the DMA */ - pci_unmap_single(h->pci_dev, 
c->req.sg[0].addr, c->req.sg[0].size, - PCI_DMA_BIDIRECTIONAL); - /* Post submit processing */ - switch(io->cmd) { - case PASSTHRU_A: - pci_unmap_single(h->pci_dev, c->req.hdr.blk, - sizeof(ida_ioctl_t), - PCI_DMA_BIDIRECTIONAL); - case IDA_READ: - case DIAG_PASS_THRU: - case SENSE_CONTROLLER_PERFORMANCE: - case READ_FLASH_ROM: - if (copy_to_user(io->sg[0].addr, p, io->sg[0].size)) { - kfree(p); - return -EFAULT; - } - /* fall through and free p */ - case IDA_WRITE: - case IDA_WRITE_MEDIA: - case COLLECT_BUFFER: - case WRITE_FLASH_ROM: - kfree(p); - break; - default:; - /* Nothing to do */ - } - - io->rcode = c->req.hdr.rcode; - cmd_free(h, c, 0); - return(0); -} - -/* - * Commands are pre-allocated in a large block. Here we use a simple bitmap - * scheme to suballocte them to the driver. Operations that are not time - * critical (and can wait for kmalloc and possibly sleep) can pass in NULL - * as the first argument to get a new command. - */ -static cmdlist_t * cmd_alloc(ctlr_info_t *h, int get_from_pool) -{ - cmdlist_t * c; - int i; - dma_addr_t cmd_dhandle; - - if (!get_from_pool) { - c = (cmdlist_t*)pci_alloc_consistent(h->pci_dev, - sizeof(cmdlist_t), &cmd_dhandle); - if(c==NULL) - return NULL; - } else { - do { - i = find_first_zero_bit(h->cmd_pool_bits, NR_CMDS); - if (i == NR_CMDS) - return NULL; - } while(test_and_set_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG)) != 0); - c = h->cmd_pool + i; - cmd_dhandle = h->cmd_pool_dhandle + i*sizeof(cmdlist_t); - h->nr_allocs++; - } - - memset(c, 0, sizeof(cmdlist_t)); - c->busaddr = cmd_dhandle; - return c; -} - -static void cmd_free(ctlr_info_t *h, cmdlist_t *c, int got_from_pool) -{ - int i; - - if (!got_from_pool) { - pci_free_consistent(h->pci_dev, sizeof(cmdlist_t), c, - c->busaddr); - } else { - i = c - h->cmd_pool; - clear_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG)); - h->nr_frees++; - } -} - -/*********************************************************************** - name: 
sendcmd - Send a command to an IDA using the memory mapped FIFO interface - and wait for it to complete. - This routine should only be called at init time. -***********************************************************************/ -static int sendcmd( - __u8 cmd, - int ctlr, - void *buff, - size_t size, - unsigned int blk, - unsigned int blkcnt, - unsigned int log_unit ) -{ - cmdlist_t *c; - int complete; - unsigned long temp; - unsigned long i; - ctlr_info_t *info_p = hba[ctlr]; - - c = cmd_alloc(info_p, 1); - if(!c) - return IO_ERROR; - c->ctlr = ctlr; - c->hdr.unit = log_unit; - c->hdr.prio = 0; - c->hdr.size = sizeof(rblk_t) >> 2; - c->size += sizeof(rblk_t); - - /* The request information. */ - c->req.hdr.next = 0; - c->req.hdr.rcode = 0; - c->req.bp = 0; - c->req.hdr.sg_cnt = 1; - c->req.hdr.reserved = 0; - - if (size == 0) - c->req.sg[0].size = 512; - else - c->req.sg[0].size = size; - - c->req.hdr.blk = blk; - c->req.hdr.blk_cnt = blkcnt; - c->req.hdr.cmd = (unsigned char) cmd; - c->req.sg[0].addr = (__u32) pci_map_single(info_p->pci_dev, - buff, c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); - /* - * Disable interrupt - */ - info_p->access.set_intr_mask(info_p, 0); - /* Make sure there is room in the command FIFO */ - /* Actually it should be completely empty at this time. */ - for (i = 200000; i > 0; i--) { - temp = info_p->access.fifo_full(info_p); - if (temp != 0) { - break; - } - udelay(10); -DBG( - printk(KERN_WARNING "cpqarray ida%d: idaSendPciCmd FIFO full," - " waiting!\n", ctlr); -); - } - /* - * Send the cmd - */ - info_p->access.submit_command(info_p, c); - complete = pollcomplete(ctlr); - - pci_unmap_single(info_p->pci_dev, (dma_addr_t) c->req.sg[0].addr, - c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); - if (complete != 1) { - if (complete != c->busaddr) { - printk( KERN_WARNING - "cpqarray ida%d: idaSendPciCmd " - "Invalid command list address returned! 
(%08lx)\n", - ctlr, (unsigned long)complete); - cmd_free(info_p, c, 1); - return (IO_ERROR); - } - } else { - printk( KERN_WARNING - "cpqarray ida%d: idaSendPciCmd Timeout out, " - "No command list address returned!\n", - ctlr); - cmd_free(info_p, c, 1); - return (IO_ERROR); - } - - if (c->req.hdr.rcode & 0x00FE) { - if (!(c->req.hdr.rcode & BIG_PROBLEM)) { - printk( KERN_WARNING - "cpqarray ida%d: idaSendPciCmd, error: " - "Controller failed at init time " - "cmd: 0x%x, return code = 0x%x\n", - ctlr, c->req.hdr.cmd, c->req.hdr.rcode); - - cmd_free(info_p, c, 1); - return (IO_ERROR); - } - } - cmd_free(info_p, c, 1); - return (IO_OK); -} - -/* - * revalidate_allvol is for online array config utilities. After a - * utility reconfigures the drives in the array, it can use this function - * (through an ioctl) to make the driver zap any previous disk structs for - * that controller and get new ones. - * - * Right now I'm using the getgeometry() function to do this, but this - * function should probably be finer grained and allow you to revalidate one - * particualar logical volume (instead of all of them on a particular - * controller). - */ -static int revalidate_allvol(ctlr_info_t *host) -{ - int ctlr = host->ctlr; - int i; - unsigned long flags; - - spin_lock_irqsave(IDA_LOCK(ctlr), flags); - if (host->usage_count > 1) { - spin_unlock_irqrestore(IDA_LOCK(ctlr), flags); - printk(KERN_WARNING "cpqarray: Device busy for volume" - " revalidation (usage=%d)\n", host->usage_count); - return -EBUSY; - } - host->usage_count++; - spin_unlock_irqrestore(IDA_LOCK(ctlr), flags); - - /* - * Set the partition and block size structures for all volumes - * on this controller to zero. 
We will reread all of this data - */ - set_capacity(ida_gendisk[ctlr][0], 0); - for (i = 1; i < NWD; i++) { - struct gendisk *disk = ida_gendisk[ctlr][i]; - if (disk->flags & GENHD_FL_UP) - del_gendisk(disk); - } - memset(host->drv, 0, sizeof(drv_info_t)*NWD); - - /* - * Tell the array controller not to give us any interrupts while - * we check the new geometry. Then turn interrupts back on when - * we're done. - */ - host->access.set_intr_mask(host, 0); - getgeometry(ctlr); - host->access.set_intr_mask(host, FIFO_NOT_EMPTY); - - for(i=0; idrv[i]; - if (i && !drv->nr_blks) - continue; - blk_queue_logical_block_size(host->queue, drv->blk_size); - set_capacity(disk, drv->nr_blks); - disk->queue = host->queue; - disk->private_data = drv; - if (i) - add_disk(disk); - } - - host->usage_count--; - return 0; -} - -static int ida_revalidate(struct gendisk *disk) -{ - drv_info_t *drv = disk->private_data; - set_capacity(disk, drv->nr_blks); - return 0; -} - -/******************************************************************** - name: pollcomplete - Wait polling for a command to complete. - The memory mapped FIFO is polled for the completion. - Used only at init time, interrupts disabled. - ********************************************************************/ -static int pollcomplete(int ctlr) -{ - int done; - int i; - - /* Wait (up to 2 seconds) for a command to complete */ - - for (i = 200000; i > 0; i--) { - done = hba[ctlr]->access.command_completed(hba[ctlr]); - if (done == 0) { - udelay(10); /* a short fixed delay */ - } else - return (done); - } - /* Invalid address to tell caller we ran out of time */ - return 1; -} -/***************************************************************** - start_fwbk - Starts controller firmwares background processing. - Currently only the Integrated Raid controller needs this done. 
- If the PCI mem address registers are written to after this, - data corruption may occur -*****************************************************************/ -static void start_fwbk(int ctlr) -{ - id_ctlr_t *id_ctlr_buf; - int ret_code; - - if( (hba[ctlr]->board_id != 0x40400E11) - && (hba[ctlr]->board_id != 0x40480E11) ) - - /* Not a Integrated Raid, so there is nothing for us to do */ - return; - printk(KERN_DEBUG "cpqarray: Starting firmware's background" - " processing\n"); - /* Command does not return anything, but idasend command needs a - buffer */ - id_ctlr_buf = kmalloc(sizeof(id_ctlr_t), GFP_KERNEL); - if(id_ctlr_buf==NULL) - { - printk(KERN_WARNING "cpqarray: Out of memory. " - "Unable to start background processing.\n"); - return; - } - ret_code = sendcmd(RESUME_BACKGROUND_ACTIVITY, ctlr, - id_ctlr_buf, 0, 0, 0, 0); - if(ret_code != IO_OK) - printk(KERN_WARNING "cpqarray: Unable to start" - " background processing\n"); - - kfree(id_ctlr_buf); -} -/***************************************************************** - getgeometry - Get ida logical volume geometry from the controller - This is a large bit of code which once existed in two flavors, - It is used only at init time. 
-*****************************************************************/ -static void getgeometry(int ctlr) -{ - id_log_drv_t *id_ldrive; - id_ctlr_t *id_ctlr_buf; - sense_log_drv_stat_t *id_lstatus_buf; - config_t *sense_config_buf; - unsigned int log_unit, log_index; - int ret_code, size; - drv_info_t *drv; - ctlr_info_t *info_p = hba[ctlr]; - int i; - - info_p->log_drv_map = 0; - - id_ldrive = kzalloc(sizeof(id_log_drv_t), GFP_KERNEL); - if (!id_ldrive) { - printk( KERN_ERR "cpqarray: out of memory.\n"); - goto err_0; - } - - id_ctlr_buf = kzalloc(sizeof(id_ctlr_t), GFP_KERNEL); - if (!id_ctlr_buf) { - printk( KERN_ERR "cpqarray: out of memory.\n"); - goto err_1; - } - - id_lstatus_buf = kzalloc(sizeof(sense_log_drv_stat_t), GFP_KERNEL); - if (!id_lstatus_buf) { - printk( KERN_ERR "cpqarray: out of memory.\n"); - goto err_2; - } - - sense_config_buf = kzalloc(sizeof(config_t), GFP_KERNEL); - if (!sense_config_buf) { - printk( KERN_ERR "cpqarray: out of memory.\n"); - goto err_3; - } - - info_p->phys_drives = 0; - info_p->log_drv_map = 0; - info_p->drv_assign_map = 0; - info_p->drv_spare_map = 0; - info_p->mp_failed_drv_map = 0; /* only initialized here */ - /* Get controllers info for this logical drive */ - ret_code = sendcmd(ID_CTLR, ctlr, id_ctlr_buf, 0, 0, 0, 0); - if (ret_code == IO_ERROR) { - /* - * If can't get controller info, set the logical drive map to 0, - * so the idastubopen will fail on all logical drives - * on the controller. 
- */ - printk(KERN_ERR "cpqarray: error sending ID controller\n"); - goto err_4; - } - - info_p->log_drives = id_ctlr_buf->nr_drvs; - for(i=0;i<4;i++) - info_p->firm_rev[i] = id_ctlr_buf->firm_rev[i]; - info_p->ctlr_sig = id_ctlr_buf->cfg_sig; - - printk(" (%s)\n", info_p->product_name); - /* - * Initialize logical drive map to zero - */ - log_index = 0; - /* - * Get drive geometry for all logical drives - */ - if (id_ctlr_buf->nr_drvs > 16) - printk(KERN_WARNING "cpqarray ida%d: This driver supports " - "16 logical drives per controller.\n. " - " Additional drives will not be " - "detected\n", ctlr); - - for (log_unit = 0; - (log_index < id_ctlr_buf->nr_drvs) - && (log_unit < NWD); - log_unit++) { - size = sizeof(sense_log_drv_stat_t); - - /* - Send "Identify logical drive status" cmd - */ - ret_code = sendcmd(SENSE_LOG_DRV_STAT, - ctlr, id_lstatus_buf, size, 0, 0, log_unit); - if (ret_code == IO_ERROR) { - /* - If can't get logical drive status, set - the logical drive map to 0, so the - idastubopen will fail for all logical drives - on the controller. - */ - info_p->log_drv_map = 0; - printk( KERN_WARNING - "cpqarray ida%d: idaGetGeometry - Controller" - " failed to report status of logical drive %d\n" - "Access to this controller has been disabled\n", - ctlr, log_unit); - goto err_4; - } - /* - Make sure the logical drive is configured - */ - if (id_lstatus_buf->status != LOG_NOT_CONF) { - ret_code = sendcmd(ID_LOG_DRV, ctlr, id_ldrive, - sizeof(id_log_drv_t), 0, 0, log_unit); - /* - If error, the bit for this - logical drive won't be set and - idastubopen will return error. 
- */ - if (ret_code != IO_ERROR) { - drv = &info_p->drv[log_unit]; - drv->blk_size = id_ldrive->blk_size; - drv->nr_blks = id_ldrive->nr_blks; - drv->cylinders = id_ldrive->drv.cyl; - drv->heads = id_ldrive->drv.heads; - drv->sectors = id_ldrive->drv.sect_per_track; - info_p->log_drv_map |= (1 << log_unit); - - printk(KERN_INFO "cpqarray ida/c%dd%d: blksz=%d nr_blks=%d\n", - ctlr, log_unit, drv->blk_size, drv->nr_blks); - ret_code = sendcmd(SENSE_CONFIG, - ctlr, sense_config_buf, - sizeof(config_t), 0, 0, log_unit); - if (ret_code == IO_ERROR) { - info_p->log_drv_map = 0; - printk(KERN_ERR "cpqarray: error sending sense config\n"); - goto err_4; - } - - info_p->phys_drives = - sense_config_buf->ctlr_phys_drv; - info_p->drv_assign_map - |= sense_config_buf->drv_asgn_map; - info_p->drv_assign_map - |= sense_config_buf->spare_asgn_map; - info_p->drv_spare_map - |= sense_config_buf->spare_asgn_map; - } /* end of if no error on id_ldrive */ - log_index = log_index + 1; - } /* end of if logical drive configured */ - } /* end of for log_unit */ - - /* Free all the buffers and return */ -err_4: - kfree(sense_config_buf); -err_3: - kfree(id_lstatus_buf); -err_2: - kfree(id_ctlr_buf); -err_1: - kfree(id_ldrive); -err_0: - return; -} - -static void __exit cpqarray_exit(void) -{ - int i; - - pci_unregister_driver(&cpqarray_pci_driver); - - /* Double check that all controller entries have been removed */ - for(i=0; i -#include -#include -#include -#endif - -#include "ida_cmd.h" - -#define IO_OK 0 -#define IO_ERROR 1 -#define NWD 16 -#define NWD_SHIFT 4 - -#define IDA_TIMER (5*HZ) -#define IDA_TIMEOUT (10*HZ) - -#define MISC_NONFATAL_WARN 0x01 - -typedef struct { - unsigned blk_size; - unsigned nr_blks; - unsigned cylinders; - unsigned heads; - unsigned sectors; - int usage_count; -} drv_info_t; - -#ifdef __KERNEL__ - -struct ctlr_info; -typedef struct ctlr_info ctlr_info_t; - -struct access_method { - void (*submit_command)(ctlr_info_t *h, cmdlist_t *c); - void 
(*set_intr_mask)(ctlr_info_t *h, unsigned long val); - unsigned long (*fifo_full)(ctlr_info_t *h); - unsigned long (*intr_pending)(ctlr_info_t *h); - unsigned long (*command_completed)(ctlr_info_t *h); -}; - -struct board_type { - __u32 board_id; - char *product_name; - struct access_method *access; -}; - -struct ctlr_info { - int ctlr; - char devname[8]; - __u32 log_drv_map; - __u32 drv_assign_map; - __u32 drv_spare_map; - __u32 mp_failed_drv_map; - - char firm_rev[4]; - int ctlr_sig; - - int log_drives; - int phys_drives; - - struct pci_dev *pci_dev; /* NULL if EISA */ - __u32 board_id; - char *product_name; - - void __iomem *vaddr; - unsigned long paddr; - unsigned long io_mem_addr; - unsigned long io_mem_length; - int intr; - int usage_count; - drv_info_t drv[NWD]; - struct proc_dir_entry *proc; - - struct access_method access; - - cmdlist_t *reqQ; - cmdlist_t *cmpQ; - cmdlist_t *cmd_pool; - dma_addr_t cmd_pool_dhandle; - unsigned long *cmd_pool_bits; - struct request_queue *queue; - spinlock_t lock; - - unsigned int Qdepth; - unsigned int maxQsinceinit; - - unsigned int nr_requests; - unsigned int nr_allocs; - unsigned int nr_frees; - struct timer_list timer; - unsigned int misc_tflags; -}; - -#define IDA_LOCK(i) (&hba[i]->lock) - -#endif - -#endif /* CPQARRAY_H */ diff --git a/drivers/block/ida_cmd.h b/drivers/block/ida_cmd.h deleted file mode 100644 index 98b5746b3089..000000000000 --- a/drivers/block/ida_cmd.h +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Disk Array driver for Compaq SMART2 Controllers - * Copyright 1998 Compaq Computer Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Questions/Comments/Bugfixes to iss_storagedev@hp.com - * - */ -#ifndef ARRAYCMD_H -#define ARRAYCMD_H - -#include -#if 0 -#include -#endif - -/* for the Smart Array 42XX cards */ -#define S42XX_REQUEST_PORT_OFFSET 0x40 -#define S42XX_REPLY_INTR_MASK_OFFSET 0x34 -#define S42XX_REPLY_PORT_OFFSET 0x44 -#define S42XX_INTR_STATUS 0x30 - -#define S42XX_INTR_OFF 0x08 -#define S42XX_INTR_PENDING 0x08 - -#define COMMAND_FIFO 0x04 -#define COMMAND_COMPLETE_FIFO 0x08 -#define INTR_MASK 0x0C -#define INTR_STATUS 0x10 -#define INTR_PENDING 0x14 - -#define FIFO_NOT_EMPTY 0x01 -#define FIFO_NOT_FULL 0x02 - -#define BIG_PROBLEM 0x40 -#define LOG_NOT_CONF 2 - -#pragma pack(1) -typedef struct { - __u32 size; - __u32 addr; -} sg_t; - -#define RCODE_NONFATAL 0x02 -#define RCODE_FATAL 0x04 -#define RCODE_INVREQ 0x10 -typedef struct { - __u16 next; - __u8 cmd; - __u8 rcode; - __u32 blk; - __u16 blk_cnt; - __u8 sg_cnt; - __u8 reserved; -} rhdr_t; - -#define SG_MAX 32 -typedef struct { - rhdr_t hdr; - sg_t sg[SG_MAX]; - __u32 bp; -} rblk_t; - -typedef struct { - __u8 unit; - __u8 prio; - __u16 size; -} chdr_t; - -#define CMD_RWREQ 0x00 -#define CMD_IOCTL_PEND 0x01 -#define CMD_IOCTL_DONE 0x02 - -typedef struct cmdlist { - chdr_t hdr; - rblk_t req; - __u32 size; - int retry_cnt; - __u32 busaddr; - int ctlr; - struct cmdlist *prev; - struct cmdlist *next; - struct request *rq; - int type; -} cmdlist_t; - -#define ID_CTLR 0x11 -typedef struct { - __u8 nr_drvs; - __u32 cfg_sig; - __u8 firm_rev[4]; - __u8 
rom_rev[4]; - __u8 hw_rev; - __u32 bb_rev; - __u32 drv_present_map; - __u32 ext_drv_map; - __u32 board_id; - __u8 cfg_error; - __u32 non_disk_bits; - __u8 bad_ram_addr; - __u8 cpu_rev; - __u8 pdpi_rev; - __u8 epic_rev; - __u8 wcxc_rev; - __u8 marketing_rev; - __u8 ctlr_flags; - __u8 host_flags; - __u8 expand_dis; - __u8 scsi_chips; - __u32 max_req_blocks; - __u32 ctlr_clock; - __u8 drvs_per_bus; - __u16 big_drv_present_map[8]; - __u16 big_ext_drv_map[8]; - __u16 big_non_disk_map[8]; - __u16 task_flags; - __u8 icl_bus; - __u8 red_modes; - __u8 cur_red_mode; - __u8 red_ctlr_stat; - __u8 red_fail_reason; - __u8 reserved[403]; -} id_ctlr_t; - -typedef struct { - __u16 cyl; - __u8 heads; - __u8 xsig; - __u8 psectors; - __u16 wpre; - __u8 maxecc; - __u8 drv_ctrl; - __u16 pcyls; - __u8 pheads; - __u16 landz; - __u8 sect_per_track; - __u8 cksum; -} drv_param_t; - -#define ID_LOG_DRV 0x10 -typedef struct { - __u16 blk_size; - __u32 nr_blks; - drv_param_t drv; - __u8 fault_tol; - __u8 reserved; - __u8 bios_disable; -} id_log_drv_t; - -#define ID_LOG_DRV_EXT 0x18 -typedef struct { - __u32 log_drv_id; - __u8 log_drv_label[64]; - __u8 reserved[418]; -} id_log_drv_ext_t; - -#define SENSE_LOG_DRV_STAT 0x12 -typedef struct { - __u8 status; - __u32 fail_map; - __u16 read_err[32]; - __u16 write_err[32]; - __u8 drv_err_data[256]; - __u8 drq_timeout[32]; - __u32 blks_to_recover; - __u8 drv_recovering; - __u16 remap_cnt[32]; - __u32 replace_drv_map; - __u32 act_spare_map; - __u8 spare_stat; - __u8 spare_repl_map[32]; - __u32 repl_ok_map; - __u8 media_exch; - __u8 cache_fail; - __u8 expn_fail; - __u8 unit_flags; - __u16 big_fail_map[8]; - __u16 big_remap_map[128]; - __u16 big_repl_map[8]; - __u16 big_act_spare_map[8]; - __u8 big_spar_repl_map[128]; - __u16 big_repl_ok_map[8]; - __u8 big_drv_rebuild; - __u8 reserved[36]; -} sense_log_drv_stat_t; - -#define START_RECOVER 0x13 - -#define ID_PHYS_DRV 0x15 -typedef struct { - __u8 scsi_bus; - __u8 scsi_id; - __u16 blk_size; - __u32 nr_blks; 
- __u32 rsvd_blks; - __u8 drv_model[40]; - __u8 drv_sn[40]; - __u8 drv_fw[8]; - __u8 scsi_iq_bits; - __u8 compaq_drv_stmp; - __u8 last_fail; - __u8 phys_drv_flags; - __u8 phys_drv_flags1; - __u8 scsi_lun; - __u8 phys_drv_flags2; - __u8 reserved; - __u32 spi_speed_rules; - __u8 phys_connector[2]; - __u8 phys_box_on_bus; - __u8 phys_bay_in_box; -} id_phys_drv_t; - -#define BLINK_DRV_LEDS 0x16 -typedef struct { - __u32 blink_duration; - __u32 reserved; - __u8 blink[256]; - __u8 reserved1[248]; -} blink_drv_leds_t; - -#define SENSE_BLINK_LEDS 0x17 -typedef struct { - __u32 blink_duration; - __u32 btime_elap; - __u8 blink[256]; - __u8 reserved1[248]; -} sense_blink_leds_t; - -#define IDA_READ 0x20 -#define IDA_WRITE 0x30 -#define IDA_WRITE_MEDIA 0x31 -#define RESET_TO_DIAG 0x40 -#define DIAG_PASS_THRU 0x41 - -#define SENSE_CONFIG 0x50 -#define SET_CONFIG 0x51 -typedef struct { - __u32 cfg_sig; - __u16 compat_port; - __u8 data_dist_mode; - __u8 surf_an_ctrl; - __u16 ctlr_phys_drv; - __u16 log_unit_phys_drv; - __u16 fault_tol_mode; - __u8 phys_drv_param[16]; - drv_param_t drv; - __u32 drv_asgn_map; - __u16 dist_factor; - __u32 spare_asgn_map; - __u8 reserved[6]; - __u16 os; - __u8 ctlr_order; - __u8 extra_info; - __u32 data_offs; - __u8 parity_backedout_write_drvs; - __u8 parity_dist_mode; - __u8 parity_shift_fact; - __u8 bios_disable_flag; - __u32 blks_on_vol; - __u32 blks_per_drv; - __u8 scratch[16]; - __u16 big_drv_map[8]; - __u16 big_spare_map[8]; - __u8 ss_source_vol; - __u8 mix_drv_cap_range; - struct { - __u16 big_drv_map[8]; - __u32 blks_per_drv; - __u16 fault_tol_mode; - __u16 dist_factor; - } MDC_range[4]; - __u8 reserved1[248]; -} config_t; - -#define BYPASS_VOL_STATE 0x52 -#define SS_CREATE_VOL 0x53 -#define CHANGE_CONFIG 0x54 -#define SENSE_ORIG_CONF 0x55 -#define REORDER_LOG_DRV 0x56 -typedef struct { - __u8 old_units[32]; -} reorder_log_drv_t; - -#define LABEL_LOG_DRV 0x57 -typedef struct { - __u8 log_drv_label[64]; -} label_log_drv_t; - -#define SS_TO_VOL 
0x58 - -#define SET_SURF_DELAY 0x60 -typedef struct { - __u16 delay; - __u8 reserved[510]; -} surf_delay_t; - -#define SET_OVERHEAT_DELAY 0x61 -typedef struct { - __u16 delay; -} overhead_delay_t; - -#define SET_MP_DELAY -typedef struct { - __u16 delay; - __u8 reserved[510]; -} mp_delay_t; - -#define PASSTHRU_A 0x91 -typedef struct { - __u8 target; - __u8 bus; - __u8 lun; - __u32 timeout; - __u32 flags; - __u8 status; - __u8 error; - __u8 cdb_len; - __u8 sense_error; - __u8 sense_key; - __u32 sense_info; - __u8 sense_code; - __u8 sense_qual; - __u32 residual; - __u8 reserved[4]; - __u8 cdb[12]; -} scsi_param_t; - -#define RESUME_BACKGROUND_ACTIVITY 0x99 -#define SENSE_CONTROLLER_PERFORMANCE 0xa8 -#define FLUSH_CACHE 0xc2 -#define COLLECT_BUFFER 0xd2 -#define READ_FLASH_ROM 0xf6 -#define WRITE_FLASH_ROM 0xf7 -#pragma pack() - -#endif /* ARRAYCMD_H */ diff --git a/drivers/block/ida_ioctl.h b/drivers/block/ida_ioctl.h deleted file mode 100644 index 888fff9caed0..000000000000 --- a/drivers/block/ida_ioctl.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Disk Array driver for Compaq SMART2 Controllers - * Copyright 1998 Compaq Computer Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Questions/Comments/Bugfixes to iss_storagedev@hp.com - * - */ -#ifndef IDA_IOCTL_H -#define IDA_IOCTL_H - -#include "ida_cmd.h" -#include "cpqarray.h" - -#define IDAGETDRVINFO 0x27272828 -#define IDAPASSTHRU 0x28282929 -#define IDAGETCTLRSIG 0x29293030 -#define IDAREVALIDATEVOLS 0x30303131 -#define IDADRIVERVERSION 0x31313232 -#define IDAGETPCIINFO 0x32323333 - -typedef struct _ida_pci_info_struct -{ - unsigned char bus; - unsigned char dev_fn; - __u32 board_id; -} ida_pci_info_struct; -/* - * Normally, the ioctl determines the logical unit for this command by - * the major,minor number of the fd passed to ioctl. If you need to send - * a command to a different/nonexistant unit (such as during config), you - * can override the normal behavior by setting the unit valid bit. (Normally, - * it should be zero) The controller the command is sent to is still - * determined by the major number of the open device. - */ - -#define UNITVALID 0x80 -typedef struct { - __u8 cmd; - __u8 rcode; - __u8 unit; - __u32 blk; - __u16 blk_cnt; - -/* currently, sg_cnt is assumed to be 1: only the 0th element of sg is used */ - struct { - void __user *addr; - size_t size; - } sg[SG_MAX]; - int sg_cnt; - - union ctlr_cmds { - drv_info_t drv; - unsigned char buf[1024]; - - id_ctlr_t id_ctlr; - drv_param_t drv_param; - id_log_drv_t id_log_drv; - id_log_drv_ext_t id_log_drv_ext; - sense_log_drv_stat_t sense_log_drv_stat; - id_phys_drv_t id_phys_drv; - blink_drv_leds_t blink_drv_leds; - sense_blink_leds_t sense_blink_leds; - config_t config; - reorder_log_drv_t reorder_log_drv; - label_log_drv_t label_log_drv; - surf_delay_t surf_delay; - overhead_delay_t overhead_delay; - mp_delay_t mp_delay; - scsi_param_t scsi_param; - } c; -} ida_ioctl_t; - -#endif /* IDA_IOCTL_H */ From 5e4298be45e83ecdffaabb370eea9396889b07f1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Dec 2015 16:38:22 +0100 Subject: [PATCH 47/49] brd: Fix discard request processing Avoid that discard requests 
with size >= PAGE_SIZE fail with -EIO. Refuse discard requests if the discard size is not a multiple of the page size. Fixes: 2dbe54957636 ("brd: Refuse improperly aligned discard requests") Signed-off-by: Bart Van Assche Reviewed-by: Jan Kara Cc: Christoph Hellwig Cc: Robert Elliot Cc: stable # v4.4+ Signed-off-by: Jens Axboe --- drivers/block/brd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index cb27190e9f39..f7ecc287d733 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -341,7 +341,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) if (unlikely(bio->bi_rw & REQ_DISCARD)) { if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) || - bio->bi_iter.bi_size & PAGE_MASK) + bio->bi_iter.bi_size & ~PAGE_MASK) goto io_error; discard_from_brd(brd, sector, bio->bi_iter.bi_size); goto out; From 98347a7d8a93ef8a01c8d1946a2059f38f73b239 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Tue, 15 Mar 2016 09:21:00 +0100 Subject: [PATCH 48/49] drivers:block: cpqarray clean up Commit d436641439e0 ("cpqarray: remove it from the kernel") removes the Kconfig option BLK_CPQ_DA and cpqarray. Remove the dead build rule in the Makefile.
Signed-off-by: Valentin Rothberg Signed-off-by: Jens Axboe --- drivers/block/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 671329023ec2..1e9661e26f29 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -15,7 +15,6 @@ obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o -obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o obj-$(CONFIG_XILINX_SYSACE) += xsysace.o From 118472ab8532e55f48395ef5764b354fe48b1d73 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 18 Feb 2016 09:57:48 -0700 Subject: [PATCH 49/49] NVMe: Expose ns wwid through single sysfs entry The method to uniquely identify a namespace depends on the controller's specification revision level and implemented capabilities. This patch has the driver figure this out and exports the unique string through a single 'wwid' attribute so the user doesn't have this burden. The longest namespace unique identifier is used if available. If not available, the driver will concatenate the controller's vendor, serial, and model with the namespace ID. The specification provides this as a unique identifier.
Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 26 ++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 27 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4304be00e556..266918b9bb84 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -915,6 +915,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; @@ -1053,6 +1054,30 @@ static ssize_t nvme_sysfs_reset(struct device *dev, } static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); +static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + int serial_len = sizeof(ctrl->serial); + int model_len = sizeof(ctrl->model); + + if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + return sprintf(buf, "eui.%16phN\n", ns->uuid); + + if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) + return sprintf(buf, "eui.%8phN\n", ns->eui); + + while (ctrl->serial[serial_len - 1] == ' ') + serial_len--; + while (ctrl->model[model_len - 1] == ' ') + model_len--; + + return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, + serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); +} +static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1078,6 +1103,7 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); static struct attribute *nvme_ns_attrs[] = { + &dev_attr_wwid.attr, &dev_attr_uuid.attr, &dev_attr_eui.attr, &dev_attr_nsid.attr, diff --git a/drivers/nvme/host/nvme.h 
b/drivers/nvme/host/nvme.h index a402a0ebf471..bf3f143e975b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -91,6 +91,7 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 stripe_size; u16 oncs; + u16 vid; atomic_t abort_limit; u8 event_limit; u8 vwc;