From 5de8acb41c86f1d335d165e0a350441ea3a1f480 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 29 May 2024 17:09:07 +0200 Subject: [PATCH 01/32] fuse: cleanup request queuing towards virtiofs Virtiofs has its own queuing mechanism, but still requests are first queued on fiq->pending to be immediately dequeued and queued onto the virtio queue. The queuing on fiq->pending is unnecessary and might even have some performance impact due to being a contention point. Forget requests are handled similarly. Move the queuing of requests and forgets into the fiq->ops->*. fuse_iqueue_ops are renamed to reflect the new semantics. Reviewed-by: Stefan Hajnoczi Fixed-by: Jingbo Xu Reviewed-by: Jingbo Xu Tested-by: Peter-Jan Gootzen Reviewed-by: Peter-Jan Gootzen Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 159 ++++++++++++++++++++++++-------------------- fs/fuse/fuse_i.h | 19 ++---- fs/fuse/virtio_fs.c | 41 ++++-------- 3 files changed, 106 insertions(+), 113 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 67443ef07285..2364df0324f7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -194,11 +194,22 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) } EXPORT_SYMBOL_GPL(fuse_len_args); -u64 fuse_get_unique(struct fuse_iqueue *fiq) +static u64 fuse_get_unique_locked(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } + +u64 fuse_get_unique(struct fuse_iqueue *fiq) +{ + u64 ret; + + spin_lock(&fiq->lock); + ret = fuse_get_unique_locked(fiq); + spin_unlock(&fiq->lock); + + return ret; +} EXPORT_SYMBOL_GPL(fuse_get_unique); static unsigned int fuse_req_hash(u64 unique) @@ -217,22 +228,68 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } +static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget) +{ + spin_lock(&fiq->lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + fuse_dev_wake_and_unlock(fiq); + } else { + 
kfree(forget); + spin_unlock(&fiq->lock); + } +} + +static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + spin_lock(&fiq->lock); + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + /* + * Pairs with smp_mb() implied by test_and_set_bit() + * from fuse_request_end(). + */ + smp_mb(); + if (test_bit(FR_FINISHED, &req->flags)) { + list_del_init(&req->intr_entry); + spin_unlock(&fiq->lock); + } else { + fuse_dev_wake_and_unlock(fiq); + } + } else { + spin_unlock(&fiq->lock); + } +} + +static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + spin_lock(&fiq->lock); + if (fiq->connected) { + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique_locked(fiq); + list_add_tail(&req->list, &fiq->pending); + fuse_dev_wake_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); + req->out.h.error = -ENOTCONN; + fuse_request_end(req); + } +} + const struct fuse_iqueue_ops fuse_dev_fiq_ops = { - .wake_forget_and_unlock = fuse_dev_wake_and_unlock, - .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, - .wake_pending_and_unlock = fuse_dev_wake_and_unlock, + .send_forget = fuse_dev_queue_forget, + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_dev_queue_req, }; EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); -static void queue_request_and_unlock(struct fuse_iqueue *fiq, - struct fuse_req *req) -__releases(fiq->lock) +static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - list_add_tail(&req->list, &fiq->pending); - fiq->ops->wake_pending_and_unlock(fiq); + fiq->ops->send_req(fiq, req); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -243,15 +300,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nodeid = nodeid; 
forget->forget_one.nlookup = nlookup; - spin_lock(&fiq->lock); - if (fiq->connected) { - fiq->forget_list_tail->next = forget; - fiq->forget_list_tail = forget; - fiq->ops->wake_forget_and_unlock(fiq); - } else { - kfree(forget); - spin_unlock(&fiq->lock); - } + fiq->ops->send_forget(fiq, forget); } static void flush_bg_queue(struct fuse_conn *fc) @@ -265,9 +314,7 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; - spin_lock(&fiq->lock); - req->in.h.unique = fuse_get_unique(fiq); - queue_request_and_unlock(fiq, req); + fuse_send_one(fiq, req); } } @@ -337,29 +384,12 @@ static int queue_interrupt(struct fuse_req *req) { struct fuse_iqueue *fiq = &req->fm->fc->iq; - spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ - if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { - spin_unlock(&fiq->lock); + if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) return -EINVAL; - } - if (list_empty(&req->intr_entry)) { - list_add_tail(&req->intr_entry, &fiq->interrupts); - /* - * Pairs with smp_mb() implied by test_and_set_bit() - * from fuse_request_end(). 
- */ - smp_mb(); - if (test_bit(FR_FINISHED, &req->flags)) { - list_del_init(&req->intr_entry); - spin_unlock(&fiq->lock); - return 0; - } - fiq->ops->wake_interrupt_and_unlock(fiq); - } else { - spin_unlock(&fiq->lock); - } + fiq->ops->send_interrupt(fiq, req); + return 0; } @@ -414,21 +444,15 @@ static void __fuse_request_send(struct fuse_req *req) struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); - spin_lock(&fiq->lock); - if (!fiq->connected) { - spin_unlock(&fiq->lock); - req->out.h.error = -ENOTCONN; - } else { - req->in.h.unique = fuse_get_unique(fiq); - /* acquire extra reference, since request is still needed - after fuse_request_end() */ - __fuse_get_request(req); - queue_request_and_unlock(fiq, req); - request_wait_answer(req); - /* Pairs with smp_wmb() in fuse_request_end() */ - smp_rmb(); - } + /* acquire extra reference, since request is still needed after + fuse_request_end() */ + __fuse_get_request(req); + fuse_send_one(fiq, req); + + request_wait_answer(req); + /* Pairs with smp_wmb() in fuse_request_end() */ + smp_rmb(); } static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) @@ -583,7 +607,6 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, { struct fuse_req *req; struct fuse_iqueue *fiq = &fm->fc->iq; - int err = 0; req = fuse_get_req(fm, false); if (IS_ERR(req)) @@ -594,16 +617,9 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, fuse_args_to_req(req, args); - spin_lock(&fiq->lock); - if (fiq->connected) { - queue_request_and_unlock(fiq, req); - } else { - err = -ENODEV; - spin_unlock(&fiq->lock); - fuse_put_request(req); - } + fuse_send_one(fiq, req); - return err; + return 0; } /* @@ -1075,9 +1091,9 @@ __releases(fiq->lock) return err ? 
err : reqsize; } -struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, - unsigned int max, - unsigned int *countp) +static struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; @@ -1096,7 +1112,6 @@ struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, return head; } -EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, @@ -1111,7 +1126,7 @@ __releases(fiq->lock) struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique(fiq), + .unique = fuse_get_unique_locked(fiq), .len = sizeof(ih) + sizeof(arg), }; @@ -1142,7 +1157,7 @@ __releases(fiq->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique(fiq), + .unique = fuse_get_unique_locked(fiq), .len = sizeof(ih) + sizeof(arg), }; @@ -1828,7 +1843,7 @@ static void fuse_resend(struct fuse_conn *fc) } /* iq and pq requests are both oldest to newest */ list_splice(&to_queue, &fiq->pending); - fiq->ops->wake_pending_and_unlock(fiq); + fuse_dev_wake_and_unlock(fiq); } static int fuse_notify_resend(struct fuse_conn *fc) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f23919610313..33b21255817e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -449,22 +449,19 @@ struct fuse_iqueue; */ struct fuse_iqueue_ops { /** - * Signal that a forget has been queued + * Send one forget */ - void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_forget)(struct fuse_iqueue *fiq, struct fuse_forget_link *link); /** - * Signal that an INTERRUPT request has been queued + * Send interrupt for request */ - void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); 
+ void (*send_interrupt)(struct fuse_iqueue *fiq, struct fuse_req *req); /** - * Signal that a request has been queued + * Send one request */ - void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_req)(struct fuse_iqueue *fiq, struct fuse_req *req); /** * Clean up when fuse_iqueue is destroyed @@ -1053,10 +1050,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, - unsigned int max, - unsigned int *countp); - /* * Initialize READ or READDIR request */ diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index dd5260141615..2fee9eb5ad0b 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1091,22 +1091,13 @@ static struct virtio_driver virtio_fs_driver = { #endif }; -static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *link) { - struct fuse_forget_link *link; struct virtio_fs_forget *forget; struct virtio_fs_forget_req *req; - struct virtio_fs *fs; - struct virtio_fs_vq *fsvq; - u64 unique; - - link = fuse_dequeue_forget(fiq, 1, NULL); - unique = fuse_get_unique(fiq); - - fs = fiq->priv; - fsvq = &fs->vqs[VQ_HIPRIO]; - spin_unlock(&fiq->lock); + struct virtio_fs *fs = fiq->priv; + struct virtio_fs_vq *fsvq = &fs->vqs[VQ_HIPRIO]; + u64 unique = fuse_get_unique(fiq); /* Allocate a buffer for the request */ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); @@ -1126,8 +1117,7 @@ __releases(fiq->lock) kfree(link); } -static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { /* * TODO interrupts. 
@@ -1136,7 +1126,6 @@ __releases(fiq->lock) * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) * with shared lock between host and guest. */ - spin_unlock(&fiq->lock); } /* Count number of scatter-gather elements required */ @@ -1341,21 +1330,17 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, return ret; } -static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) { unsigned int queue_id; struct virtio_fs *fs; - struct fuse_req *req; struct virtio_fs_vq *fsvq; int ret; - WARN_ON(list_empty(&fiq->pending)); - req = list_last_entry(&fiq->pending, struct fuse_req, list); + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + clear_bit(FR_PENDING, &req->flags); - list_del_init(&req->list); - WARN_ON(!list_empty(&fiq->pending)); - spin_unlock(&fiq->lock); fs = fiq->priv; queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()]; @@ -1393,10 +1378,10 @@ __releases(fiq->lock) } static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { - .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, - .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, - .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, - .release = virtio_fs_fiq_release, + .send_forget = virtio_fs_send_forget, + .send_interrupt = virtio_fs_send_interrupt, + .send_req = virtio_fs_send_req, + .release = virtio_fs_fiq_release, }; static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) From ac5cffec53be0b0231b89470a357bd3a5814f599 Mon Sep 17 00:00:00 2001 From: yangyun Date: Wed, 14 Aug 2024 17:36:00 +0800 Subject: [PATCH 02/32] fuse: add fast path for fuse_range_is_writeback In some cases, the fi->writepages may be empty. And there is no need to check fi->writepages with spin_lock, which may have an impact on performance due to lock contention. 
For example, in scenarios where multiple readers read the same file without any writers, or where the page cache is not enabled. Also remove the outdated comment since commit 6b2fb79963fb ("fuse: optimize writepages search") has optimized the situation by replacing list with rb-tree. Signed-off-by: yangyun Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ed76121f73f2..b41c032d1c6d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -448,9 +448,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, /* * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. */ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, pgoff_t idx_to) @@ -458,6 +455,9 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, struct fuse_inode *fi = get_fuse_inode(inode); bool found; + if (RB_EMPTY_ROOT(&fi->writepages)) + return false; + spin_lock(&fi->lock); found = fuse_find_writeback(fi, idx_from, idx_to); spin_unlock(&fi->lock); From 509a6458b44f72bb6854854c89cf76e56f11c9f1 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 26 Aug 2024 14:19:02 -0700 Subject: [PATCH 03/32] fuse: drop unused fuse_mount arg in fuse_writepage_finish() Drop the unused "struct fuse_mount *fm" arg in fuse_writepage_finish(). No functional changes added.
Signed-off-by: Joanne Koong Reviewed-by: Jingbo Xu Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b41c032d1c6d..39d20c88283b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1769,8 +1769,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) kfree(wpa); } -static void fuse_writepage_finish(struct fuse_mount *fm, - struct fuse_writepage_args *wpa) +static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; @@ -1829,7 +1828,7 @@ __acquires(fi->lock) out_free: fi->writectr--; rb_erase(&wpa->writepages_entry, &fi->writepages); - fuse_writepage_finish(fm, wpa); + fuse_writepage_finish(wpa); spin_unlock(&fi->lock); /* After rb_erase() aux request list is private */ @@ -1965,7 +1964,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, fuse_send_writepage(fm, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fm, wpa); + fuse_writepage_finish(wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } From c04e3b2118192384153b4eac595768e2ffb7ac4a Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 26 Aug 2024 14:19:03 -0700 Subject: [PATCH 04/32] fuse: refactor finished writeback stats updates into helper function Move the logic for updating the bdi and page stats for a finished writeback into a separate helper function, where it can be called from both fuse_writepage_finish() and fuse_writepage_add() (in the case where there is already an auxiliary write request for the page). No functional changes added. 
Suggested-by: Jingbo Xu Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 39d20c88283b..e0bc5528d6b5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1769,19 +1769,25 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) kfree(wpa); } +static void fuse_writepage_finish_stat(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(page, NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); +} + static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_pages; i++) { - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - } + for (i = 0; i < ap->num_pages; i++) + fuse_writepage_finish_stat(inode, ap->pages[i]); + wake_up(&fi->page_waitq); } @@ -1833,14 +1839,9 @@ __acquires(fi->lock) /* After rb_erase() aux request list is private */ for (aux = wpa->next; aux; aux = next) { - struct backing_dev_info *bdi = inode_to_bdi(aux->inode); - next = aux->next; aux->next = NULL; - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); + fuse_writepage_finish_stat(aux->inode, aux->ia.ap.pages[0]); fuse_writepage_free(aux); } @@ -2209,11 +2210,7 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, spin_unlock(&fi->lock); if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); -
wb_writeout_inc(&bdi->wb); + fuse_writepage_finish_stat(new_wpa->inode, new_ap->pages[0]); fuse_writepage_free(new_wpa); } From 672c3b7457fcee9656c36a29a4b21ec4a652433e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 26 Aug 2024 14:19:05 -0700 Subject: [PATCH 05/32] fuse: move initialization of fuse_file to fuse_writepages() instead of in callback Prior to this change, data->ff is checked and if not initialized then initialized in the fuse_writepages_fill() callback, which gets called for every dirty page in the address space mapping. This logic is better placed in the main fuse_writepages() caller where data.ff is initialized before walking the dirty pages. No functional changes added. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e0bc5528d6b5..9d4f8bdc3865 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2263,13 +2263,6 @@ static int fuse_writepages_fill(struct folio *folio, struct page *tmp_page; int err; - if (!data->ff) { - err = -EIO; - data->ff = fuse_write_file_get(fi); - if (!data->ff) - goto out_unlock; - } - if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; @@ -2348,13 +2341,13 @@ static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_wb_data data; int err; - err = -EIO; if (fuse_is_bad(inode)) - goto out; + return -EIO; if (wbc->sync_mode == WB_SYNC_NONE && fc->num_background >= fc->congestion_threshold) @@ -2362,7 +2355,9 @@ static int fuse_writepages(struct address_space *mapping, data.inode = inode; data.wpa = NULL; - data.ff = NULL; + data.ff = fuse_write_file_get(fi); + if (!data.ff) + return -EIO; err = -ENOMEM; 
data.orig_pages = kcalloc(fc->max_pages, @@ -2376,11 +2371,10 @@ static int fuse_writepages(struct address_space *mapping, WARN_ON(!data.wpa->ia.ap.num_pages); fuse_writepages_send(&data); } - if (data.ff) - fuse_file_put(data.ff, false); kfree(data.orig_pages); out: + fuse_file_put(data.ff, false); return err; } From 9a8ebcf5e04e6cc9472bfcdd90b2aeef35a2f8f6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 26 Aug 2024 14:19:06 -0700 Subject: [PATCH 06/32] fuse: convert fuse_writepages_fill() to use a folio for its tmp page To pave the way for refactoring out the shared logic in fuse_writepages_fill() and fuse_writepage_locked(), this change converts the temporary page in fuse_writepages_fill() to use the folio API. This is similar to the change in commit e0887e095a80 ("fuse: Convert fuse_writepage_locked to take a folio"), which converted the tmp page in fuse_writepage_locked() to use the folio API. inc_node_page_state() is intentionally preserved here instead of converting to node_stat_add_folio() since it is updating the stat of the underlying page and to better maintain API symmetry with dec_node_page_state() in fuse_writepage_finish_stat(). No functional changes added.
Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d4f8bdc3865..0653fac40782 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2260,7 +2260,7 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct page *tmp_page; + struct folio *tmp_folio; int err; if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { @@ -2269,8 +2269,8 @@ static int fuse_writepages_fill(struct folio *folio, } err = -ENOMEM; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) goto out_unlock; /* @@ -2290,7 +2290,7 @@ static int fuse_writepages_fill(struct folio *folio, err = -ENOMEM; wpa = fuse_writepage_args_alloc(); if (!wpa) { - __free_page(tmp_page); + folio_put(tmp_folio); goto out_unlock; } fuse_writepage_add_to_bucket(fc, wpa); @@ -2308,14 +2308,14 @@ static int fuse_writepages_fill(struct folio *folio, } folio_start_writeback(folio); - copy_highpage(tmp_page, &folio->page); - ap->pages[ap->num_pages] = tmp_page; + folio_copy(tmp_folio, folio); + ap->pages[ap->num_pages] = &tmp_folio->page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; data->orig_pages[ap->num_pages] = &folio->page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + inc_node_page_state(&tmp_folio->page, NR_WRITEBACK_TEMP); err = 0; if (data->wpa) { From 4046d3adcca42b7678f11c71e46bd32bafb4dad1 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 26 Aug 2024 14:19:07 -0700 Subject: [PATCH 07/32] fuse: move fuse file initialization to wpa allocation time Before this change, wpa->ia.ff is initialized with an acquired reference on 
the fuse file right before it submits the writeback request. If there are auxiliary writebacks, then the initialization and reference acquisition needs to also be set before we submit the auxiliary writeback request. To make the logic simpler and to pave the way for a subsequent refactoring of fuse_writepages_fill() and fuse_writepage_locked(), this change initializes and acquires wpa->ia.ff when the wpa is allocated. No functional changes added. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0653fac40782..be0d7fdf04d0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1762,8 +1762,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) for (i = 0; i < ap->num_pages; i++) __free_page(ap->pages[i]); - if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false); + fuse_file_put(wpa->ia.ff, false); kfree(ap->pages); kfree(wpa); @@ -1936,7 +1935,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, wpa->next = next->next; next->next = NULL; - next->ia.ff = fuse_file_get(wpa->ia.ff); tree_insert(&fi->writepages, next); /* @@ -2155,7 +2153,6 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) int num_pages = wpa->ia.ap.num_pages; int i; - wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); @@ -2300,6 +2297,7 @@ static int fuse_writepages_fill(struct folio *folio, ap = &wpa->ia.ap; fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->ia.ff = fuse_file_get(data->ff); wpa->next = NULL; ap->args.in_pages = true; ap->args.end = fuse_writepage_end; From 0acad9289be33d324537d6c51988be0541b1139d Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 26 Aug 2024 14:19:08 -0700 Subject: [PATCH 08/32] fuse: 
refactor out shared logic in fuse_writepages_fill() and fuse_writepage_locked() This change refactors the shared logic in fuse_writepages_fill() and fuse_writepage_locked() into two separate helper functions, fuse_writepage_args_page_fill() and fuse_writepage_args_setup(). No functional changes added. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 103 +++++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index be0d7fdf04d0..fc587f3bc56f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2047,49 +2047,77 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, rcu_read_unlock(); } +static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, + struct folio *tmp_folio, uint32_t page_index) +{ + struct inode *inode = folio->mapping->host; + struct fuse_args_pages *ap = &wpa->ia.ap; + + folio_copy(tmp_folio, folio); + + ap->pages[page_index] = &tmp_folio->page; + ap->descs[page_index].offset = 0; + ap->descs[page_index].length = PAGE_SIZE; + + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + inc_node_page_state(&tmp_folio->page, NR_WRITEBACK_TEMP); +} + +static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, + struct fuse_file *ff) +{ + struct inode *inode = folio->mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + + wpa = fuse_writepage_args_alloc(); + if (!wpa) + return NULL; + + fuse_writepage_add_to_bucket(fc, wpa); + fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->inode = inode; + wpa->ia.ff = ff; + + ap = &wpa->ia.ap; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + + return wpa; +} + static int fuse_writepage_locked(struct folio *folio) { struct address_space *mapping = 
folio->mapping; struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; struct folio *tmp_folio; + struct fuse_file *ff; int error = -ENOMEM; - folio_start_writeback(folio); - - wpa = fuse_writepage_args_alloc(); - if (!wpa) - goto err; - ap = &wpa->ia.ap; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); if (!tmp_folio) - goto err_free; + goto err; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fi); - if (!wpa->ia.ff) + ff = fuse_write_file_get(fi); + if (!ff) goto err_nofile; - fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0); + wpa = fuse_writepage_args_setup(folio, ff); + error = -ENOMEM; + if (!wpa) + goto err_writepage_args; - folio_copy(tmp_folio, folio); - wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; - ap->args.in_pages = true; + ap = &wpa->ia.ap; ap->num_pages = 1; - ap->pages[0] = &tmp_folio->page; - ap->descs[0].offset = 0; - ap->descs[0].length = PAGE_SIZE; - ap->args.end = fuse_writepage_end; - wpa->inode = inode; - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); + folio_start_writeback(folio); + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); spin_lock(&fi->lock); tree_insert(&fi->writepages, wpa); @@ -2101,13 +2129,12 @@ static int fuse_writepage_locked(struct folio *folio) return 0; +err_writepage_args: + fuse_file_put(ff, false); err_nofile: folio_put(tmp_folio); -err_free: - kfree(wpa); err: mapping_set_error(folio->mapping, error); - folio_end_writeback(folio); return error; } @@ -2285,36 +2312,20 @@ static int fuse_writepages_fill(struct folio *folio, */ if (data->wpa == NULL) { err = -ENOMEM; - wpa = fuse_writepage_args_alloc(); + wpa = fuse_writepage_args_setup(folio, data->ff); if (!wpa) { folio_put(tmp_folio); goto out_unlock; } - 
fuse_writepage_add_to_bucket(fc, wpa); - + fuse_file_get(wpa->ia.ff); data->max_pages = 1; - ap = &wpa->ia.ap; - fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); - wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->ia.ff = fuse_file_get(data->ff); - wpa->next = NULL; - ap->args.in_pages = true; - ap->args.end = fuse_writepage_end; - ap->num_pages = 0; - wpa->inode = inode; } folio_start_writeback(folio); - folio_copy(tmp_folio, folio); - ap->pages[ap->num_pages] = &tmp_folio->page; - ap->descs[ap->num_pages].offset = 0; - ap->descs[ap->num_pages].length = PAGE_SIZE; + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_pages); data->orig_pages[ap->num_pages] = &folio->page; - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(&tmp_folio->page, NR_WRITEBACK_TEMP); - err = 0; if (data->wpa) { /* From 396b209e405a571ce8e06d3760ffc3e389a944f1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 3 Jul 2024 10:38:42 -0400 Subject: [PATCH 09/32] fuse: add simple request tracepoints I've been timing various fuse operations and it's quite annoying to do with kprobes. Add two tracepoints for sending and ending fuse requests to make it easier to debug and time various operations. Signed-off-by: Josef Bacik Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/Makefile | 3 + fs/fuse/dev.c | 5 ++ fs/fuse/fuse_trace.h | 132 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 fs/fuse/fuse_trace.h diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 6e0228c6d0cb..ce0ff7a9007b 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -3,6 +3,9 @@ # Makefile for the FUSE filesystem. 
# +# Needed for trace events +ccflags-y = -I$(src) + obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2364df0324f7..1b6088288a16 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -22,6 +22,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "fuse_trace.h" + MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -289,6 +292,7 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); + trace_fuse_request_send(req); fiq->ops->send_req(fiq, req); } @@ -335,6 +339,7 @@ void fuse_request_end(struct fuse_req *req) if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; + trace_fuse_request_end(req); /* * test_and_set_bit() implies smp_mb() between bit * changing and below FR_INTERRUPTED check. Pairs with diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h new file mode 100644 index 000000000000..bbe9ddd8c716 --- /dev/null +++ b/fs/fuse/fuse_trace.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fuse + +#if !defined(_TRACE_FUSE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FUSE_H + +#include + +#define OPCODES \ + EM( FUSE_LOOKUP, "FUSE_LOOKUP") \ + EM( FUSE_FORGET, "FUSE_FORGET") \ + EM( FUSE_GETATTR, "FUSE_GETATTR") \ + EM( FUSE_SETATTR, "FUSE_SETATTR") \ + EM( FUSE_READLINK, "FUSE_READLINK") \ + EM( FUSE_SYMLINK, "FUSE_SYMLINK") \ + EM( FUSE_MKNOD, "FUSE_MKNOD") \ + EM( FUSE_MKDIR, "FUSE_MKDIR") \ + EM( FUSE_UNLINK, "FUSE_UNLINK") \ + EM( FUSE_RMDIR, "FUSE_RMDIR") \ + EM( FUSE_RENAME, "FUSE_RENAME") \ + EM( FUSE_LINK, "FUSE_LINK") \ + EM( FUSE_OPEN, "FUSE_OPEN") \ + EM( FUSE_READ, "FUSE_READ") \ + EM( FUSE_WRITE, "FUSE_WRITE") \ + EM( FUSE_STATFS, "FUSE_STATFS") \ + EM( FUSE_RELEASE, "FUSE_RELEASE") \ + EM( FUSE_FSYNC, "FUSE_FSYNC") 
\ + EM( FUSE_SETXATTR, "FUSE_SETXATTR") \ + EM( FUSE_GETXATTR, "FUSE_GETXATTR") \ + EM( FUSE_LISTXATTR, "FUSE_LISTXATTR") \ + EM( FUSE_REMOVEXATTR, "FUSE_REMOVEXATTR") \ + EM( FUSE_FLUSH, "FUSE_FLUSH") \ + EM( FUSE_INIT, "FUSE_INIT") \ + EM( FUSE_OPENDIR, "FUSE_OPENDIR") \ + EM( FUSE_READDIR, "FUSE_READDIR") \ + EM( FUSE_RELEASEDIR, "FUSE_RELEASEDIR") \ + EM( FUSE_FSYNCDIR, "FUSE_FSYNCDIR") \ + EM( FUSE_GETLK, "FUSE_GETLK") \ + EM( FUSE_SETLK, "FUSE_SETLK") \ + EM( FUSE_SETLKW, "FUSE_SETLKW") \ + EM( FUSE_ACCESS, "FUSE_ACCESS") \ + EM( FUSE_CREATE, "FUSE_CREATE") \ + EM( FUSE_INTERRUPT, "FUSE_INTERRUPT") \ + EM( FUSE_BMAP, "FUSE_BMAP") \ + EM( FUSE_DESTROY, "FUSE_DESTROY") \ + EM( FUSE_IOCTL, "FUSE_IOCTL") \ + EM( FUSE_POLL, "FUSE_POLL") \ + EM( FUSE_NOTIFY_REPLY, "FUSE_NOTIFY_REPLY") \ + EM( FUSE_BATCH_FORGET, "FUSE_BATCH_FORGET") \ + EM( FUSE_FALLOCATE, "FUSE_FALLOCATE") \ + EM( FUSE_READDIRPLUS, "FUSE_READDIRPLUS") \ + EM( FUSE_RENAME2, "FUSE_RENAME2") \ + EM( FUSE_LSEEK, "FUSE_LSEEK") \ + EM( FUSE_COPY_FILE_RANGE, "FUSE_COPY_FILE_RANGE") \ + EM( FUSE_SETUPMAPPING, "FUSE_SETUPMAPPING") \ + EM( FUSE_REMOVEMAPPING, "FUSE_REMOVEMAPPING") \ + EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ + EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ + EM( FUSE_STATX, "FUSE_STATX") \ + EMe(CUSE_INIT, "CUSE_INIT") + +/* + * This will turn the above table into TRACE_DEFINE_ENUM() for each of the + * entries. + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +OPCODES + +/* Now we redfine it with the table that __print_symbolic needs. 
*/ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} + +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, connection) + __field(uint64_t, unique) + __field(enum fuse_opcode, opcode) + __field(uint32_t, len) + ), + + TP_fast_assign( + __entry->connection = req->fm->fc->dev; + __entry->unique = req->in.h.unique; + __entry->opcode = req->in.h.opcode; + __entry->len = req->in.h.len; + ), + + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_end, + TP_PROTO(const struct fuse_req *req), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, connection) + __field(uint64_t, unique) + __field(uint32_t, len) + __field(int32_t, error) + ), + + TP_fast_assign( + __entry->connection = req->fm->fc->dev; + __entry->unique = req->in.h.unique; + __entry->len = req->out.h.len; + __entry->error = req->out.h.error; + ), + + TP_printk("connection %u req %llu len %u error %d", __entry->connection, + __entry->unique, __entry->len, __entry->error) +); + +#endif /* _TRACE_FUSE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE fuse_trace +#include From 506b21c945b9716a1e092189c260d9400c52fa14 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Fri, 17 May 2024 16:10:28 +0000 Subject: [PATCH 10/32] fuse: use correct name fuse_conn_list in docstring fuse_mount_list doesn't exist, use fuse_conn_list. 
Signed-off-by: Aurelien Aptel Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 33b21255817e..bdbf9a8f3fc5 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -866,7 +866,7 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Entry on the fuse_mount_list */ + /** Entry on the fuse_conn_list */ struct list_head entry; /** Device ID from the root super block */ From 2097154a10c6ee78be8796411e5d0ad81ee06ed6 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:12 +0200 Subject: [PATCH 11/32] namespace: introduce SB_I_NOIDMAP flag Right now we determine whether a filesystem supports vfs idmappings based on the presence of the FS_ALLOW_IDMAP flag. This "static" way works perfectly well for local filesystems like ext4, xfs, btrfs, etc. But for network-like filesystems like fuse, cephfs this approach is not ideal, because sometimes proper support of vfs idmaps requires some extensions for the on-wire protocol, which implies that changes have to be made not only in the Linux kernel code but also in the 3rd party components like libfuse, cephfs MDS server and so on. We have seen that issue during our work on cephfs idmapped mounts [1] with Christian, but right now I'm working on the idmapped mounts support for fuse/virtiofs and I think that it is the right time for this extension.
[1] 5ccd8530dd7 ("ceph: handle idmapped mounts in create_request_message()") Suggested-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/namespace.c | 4 ++++ include/linux/fs.h | 1 + 2 files changed, 5 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 328087a4df8a..d1702285c915 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4436,6 +4436,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; + /* The filesystem has turned off idmapped mounts. */ + if (m->mnt_sb->s_iflags & SB_I_NOIDMAP) + return -EINVAL; + /* We're not controlling the superblock. */ if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..6ff547ef21f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1189,6 +1189,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ /* Possible states of 'frozen' field */ enum { From aa16880d9f13c6490e80ad614402c8a6fe6f3efa Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:13 +0200 Subject: [PATCH 12/32] fuse: add basic infrastructure to support idmappings Add some preparatory changes in fuse_get_req/fuse_force_creds to handle idmappings. Miklos suggested [1], [2] to change the meaning of in.h.uid/in.h.gid fields when the daemon declares support for idmapped mounts. Under the new semantics, we fill the uid/gid values in the fuse header with the id-mapped caller uid/gid (for requests which create new inodes); in all other cases we just send -1 to userspace.
No functional changes intended. Link: https://lore.kernel.org/all/CAJfpegsVY97_5mHSc06mSw79FehFWtoXT=hhTUK_E-Yhr7OAuQ@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/CAJfpegtHQsEUuFq1k4ZbTD3E1h-GsrN3PWyv7X8cg6sfU_W2Yw@mail.gmail.com/ [2] Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 50 +++++++++++++++++++++++++++++---------- fs/fuse/inode.c | 1 + include/uapi/linux/fuse.h | 2 ++ 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1b6088288a16..b4092b3b4b5a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -108,7 +108,9 @@ static void fuse_drop_waiting(struct fuse_conn *fc) static void fuse_put_request(struct fuse_req *req); -static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) +static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, + struct fuse_mount *fm, + bool for_background) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; @@ -140,19 +142,37 @@ static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) goto out; } - req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); - if (unlikely(req->in.h.uid == ((uid_t)-1) || - req->in.h.gid == ((gid_t)-1))) { - fuse_put_request(req); - return ERR_PTR(-EOVERFLOW); + if ((fm->sb->s_iflags & SB_I_NOIDMAP) || idmap) { + kuid_t idmapped_fsuid; + kgid_t idmapped_fsgid; + + /* + * Note, that when + * (fm->sb->s_iflags & SB_I_NOIDMAP) is true, then + * (idmap == &nop_mnt_idmap) is always true and therefore, + * mapped_fsuid(idmap, fc->user_ns) == current_fsuid(). + */ + idmapped_fsuid = idmap ? mapped_fsuid(idmap, fc->user_ns) : current_fsuid(); + idmapped_fsgid = idmap ? 
mapped_fsgid(idmap, fc->user_ns) : current_fsgid(); + req->in.h.uid = from_kuid(fc->user_ns, idmapped_fsuid); + req->in.h.gid = from_kgid(fc->user_ns, idmapped_fsgid); + + if (unlikely(req->in.h.uid == ((uid_t)-1) || + req->in.h.gid == ((gid_t)-1))) { + fuse_put_request(req); + return ERR_PTR(-EOVERFLOW); + } + } else { + req->in.h.uid = FUSE_INVALID_UIDGID; + req->in.h.gid = FUSE_INVALID_UIDGID; } + return req; out: @@ -497,8 +517,14 @@ static void fuse_force_creds(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; - req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + if (req->fm->sb->s_iflags & SB_I_NOIDMAP) { + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + } else { + req->in.h.uid = FUSE_INVALID_UIDGID; + req->in.h.gid = FUSE_INVALID_UIDGID; + } + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } @@ -530,7 +556,7 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fm, false); + req = fuse_get_req(NULL, fm, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -591,7 +617,7 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fm, true); + req = fuse_get_req(NULL, fm, true); if (IS_ERR(req)) return PTR_ERR(req); } @@ -613,7 +639,7 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_req *req; struct fuse_iqueue *fiq = &fm->fc->iq; - req = fuse_get_req(fm, false); + req = fuse_get_req(NULL, fm, false); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index bebd89002328..ed53e173337b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1572,6 +1572,7 @@ static void fuse_sb_defaults(struct super_block 
*sb) sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + sb->s_iflags |= SB_I_NOIDMAP; if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index d08b99d60f6f..2ccf38181df2 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -984,6 +984,8 @@ struct fuse_fallocate_in { */ #define FUSE_UNIQUE_RESEND (1ULL << 63) +#define FUSE_INVALID_UIDGID ((uint32_t)(-1)) + struct fuse_in_header { uint32_t len; uint32_t opcode; From 10dc721836c0c968990a519e147d4cb7decdae5c Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:14 +0200 Subject: [PATCH 13/32] fuse: add an idmap argument to fuse_simple_request If idmap == NULL *and* filesystem daemon declared idmapped mounts support, then uid/gid values in a fuse header will be -1. No functional changes intended. Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 4 ++-- fs/fuse/dev.c | 6 ++++-- fs/fuse/dir.c | 26 +++++++++++++------------- fs/fuse/file.c | 32 ++++++++++++++++---------------- fs/fuse/fuse_i.h | 3 ++- fs/fuse/inode.c | 6 +++--- fs/fuse/ioctl.c | 2 +- fs/fuse/readdir.c | 4 ++-- fs/fuse/xattr.c | 8 ++++---- 9 files changed, 47 insertions(+), 44 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 12ef91d170bb..6d8368d66dd4 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -207,7 +207,7 @@ static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err < 0) return err; dmap->writable = writable; @@ -245,7 +245,7 @@ static int fuse_send_removemapping(struct inode *inode, args.in_args[0].value = inargp; args.in_args[1].size = inargp->count * 
sizeof(*remove_one); args.in_args[1].value = remove_one; - return fuse_simple_request(fm, &args); + return fuse_simple_request(NULL, fm, &args); } static int dmap_removemapping_list(struct inode *inode, unsigned int num, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4092b3b4b5a..5ca2ab48527e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -539,7 +539,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } -ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) +ssize_t fuse_simple_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; @@ -556,7 +558,7 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(NULL, fm, false); + req = fuse_get_req(idmap, fm, false); if (IS_ERR(req)) return PTR_ERR(req); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8e96df9fd76c..6ce7968365e7 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -230,7 +230,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) parent = dget_parent(entry); fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); - ret = fuse_simple_request(fm, &args); + ret = fuse_simple_request(NULL, fm, &args); dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) @@ -383,7 +383,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name attr_version = fuse_get_attr_version(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; @@ -672,7 +672,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (err) goto 
out_free_ff; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); free_ext_value(&args); if (err) goto out_free_ff; @@ -803,7 +803,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, goto out_put_forget_req; } - err = fuse_simple_request(fm, args); + err = fuse_simple_request(NULL, fm, args); free_ext_value(args); if (err) goto out_put_forget_req; @@ -987,7 +987,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); @@ -1010,7 +1010,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); @@ -1040,7 +1040,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (!err) { /* ctime changes */ fuse_update_ctime(d_inode(oldent)); @@ -1210,7 +1210,7 @@ static int fuse_do_statx(struct inode *inode, struct file *file, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err) return err; @@ -1268,7 +1268,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fm, 
&args); + err = fuse_simple_request(NULL, fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || inode_wrong_type(inode, outarg.attr.mode)) { @@ -1472,7 +1472,7 @@ static int fuse_access(struct inode *inode, int mask) args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err == -ENOSYS) { fm->fc->no_access = 1; err = 0; @@ -1584,7 +1584,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) ap.args.page_zeroing = true; ap.args.out_numargs = 1; ap.args.out_args[0].size = desc.length; - res = fuse_simple_request(fm, &ap.args); + res = fuse_simple_request(NULL, fm, &ap.args); fuse_invalidate_atime(inode); @@ -1857,7 +1857,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) } fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); - return fuse_simple_request(fm, &args); + return fuse_simple_request(NULL, fm, &args); } /* @@ -1970,7 +1970,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, inarg.valid |= FATTR_KILL_SUIDGID; } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fc587f3bc56f..3c18e90fff07 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -48,7 +48,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, args.out_args[0].size = sizeof(*outargp); args.out_args[0].value = outargp; - return fuse_simple_request(fm, &args); + return fuse_simple_request(NULL, fm, &args); } struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) @@ -111,7 +111,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) if (!args) { /* Do nothing when server does not implement 'open' */ } else if (sync) { - fuse_simple_request(ff->fm, args); + fuse_simple_request(NULL, 
ff->fm, args); fuse_release_end(ff->fm, args, 0); } else { args->end = fuse_release_end; @@ -539,7 +539,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) args.in_args[0].value = &inarg; args.force = true; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err == -ENOSYS) { fm->fc->no_flush = 1; err = 0; @@ -572,7 +572,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - return fuse_simple_request(fm, &args); + return fuse_simple_request(NULL, fm, &args); } static int fuse_fsync(struct file *file, loff_t start, loff_t end, @@ -814,7 +814,7 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, if (ia->io->async) return fuse_async_req_send(fm, ia, count); - return fuse_simple_request(fm, &ia->ap.args); + return fuse_simple_request(NULL, fm, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -878,7 +878,7 @@ static int fuse_do_readpage(struct file *file, struct page *page) desc.length--; fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); - res = fuse_simple_request(fm, &ia.ap.args); + res = fuse_simple_request(NULL, fm, &ia.ap.args); if (res < 0) return res; /* @@ -976,7 +976,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) if (!err) return; } else { - res = fuse_simple_request(fm, &ap->args); + res = fuse_simple_request(NULL, fm, &ap->args); err = res < 0 ? 
res : 0; } fuse_readpages_end(fm, &ap->args, err); @@ -1101,7 +1101,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, if (ia->io->async) return fuse_async_req_send(fm, ia, count); - err = fuse_simple_request(fm, &ia->ap.args); + err = fuse_simple_request(NULL, fm, &ia->ap.args); if (!err && ia->write.out.size > count) err = -EIO; @@ -1147,7 +1147,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID)) ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; - err = fuse_simple_request(fm, &ap->args); + err = fuse_simple_request(NULL, fm, &ap->args); if (!err && ia->write.out.size > count) err = -EIO; @@ -2661,7 +2661,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (!err) err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); @@ -2685,7 +2685,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); /* locking is restartable */ if (err == -EINTR) @@ -2759,7 +2759,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err == -ENOSYS) fm->fc->no_bmap = 1; @@ -2791,7 +2791,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err) { if (err == -ENOSYS) { fm->fc->no_lseek = 1; @@ -2924,7 +2924,7 @@ __poll_t 
fuse_file_poll(struct file *file, poll_table *wait) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (!err) return demangle_poll(outarg.revents); @@ -3146,7 +3146,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err == -ENOSYS) { fm->fc->no_fallocate = 1; err = -EOPNOTSUPP; @@ -3258,7 +3258,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; err = -EOPNOTSUPP; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bdbf9a8f3fc5..407ee2542ff5 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1147,7 +1147,8 @@ void __exit fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing */ -ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args); +ssize_t fuse_simple_request(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args); int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ed53e173337b..b08bc6081612 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -586,7 +586,7 @@ static void fuse_send_destroy(struct fuse_mount *fm) args.opcode = FUSE_DESTROY; args.force = true; args.nocreds = true; - fuse_simple_request(fm, &args); + fuse_simple_request(NULL, fm, &args); } } @@ -624,7 +624,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) args.out_numargs = 1; args.out_args[0].size = 
sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); return err; @@ -713,7 +713,7 @@ static int fuse_sync_fs(struct super_block *sb, int wait) args.nodeid = get_node_id(sb->s_root->d_inode); args.out_numargs = 0; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err == -ENOSYS) { fc->sync_fs = 0; err = 0; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 572ce8a82ceb..b40dd931167d 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -18,7 +18,7 @@ static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].size = sizeof(*outarg); args->out_args[0].value = outarg; - ret = fuse_simple_request(fm, args); + ret = fuse_simple_request(NULL, fm, args); /* Translate ENOSYS, which shouldn't be returned from fs */ if (ret == -ENOSYS) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 0377b6dc24c8..e8a093289421 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -279,7 +279,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid) args.force = true; args.noreply = true; - fuse_simple_request(fm, &args); + fuse_simple_request(NULL, fm, &args); /* ignore errors */ } @@ -358,7 +358,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fm, &ap->args); + res = fuse_simple_request(NULL, fm, &ap->args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 9f568d345c51..0a9b60de3668 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -37,7 +37,7 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, args.in_args[1].value = name; args.in_args[2].size = size; args.in_args[2].value = value; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, 
&args); if (err == -ENOSYS) { fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; @@ -79,7 +79,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fm, &args); + ret = fuse_simple_request(NULL, fm, &args); if (!ret && !size) ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { @@ -141,7 +141,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fm, &args); + ret = fuse_simple_request(NULL, fm, &args); if (!ret && !size) ret = min_t(size_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) @@ -167,7 +167,7 @@ int fuse_removexattr(struct inode *inode, const char *name) args.in_numargs = 1; args.in_args[0].size = strlen(name) + 1; args.in_args[0].value = name; - err = fuse_simple_request(fm, &args); + err = fuse_simple_request(NULL, fm, &args); if (err == -ENOSYS) { fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; From d561254fb7ba451f580becb21ab3472bc0265080 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:15 +0200 Subject: [PATCH 14/32] fuse: support idmapped FUSE_EXT_GROUPS We don't need to remap parent_gid, but have to adjust group membership checks and take idmapping into account. Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 6ce7968365e7..8a936dc0072b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -545,17 +545,21 @@ static u32 fuse_ext_size(size_t size) /* * This adds just a single supplementary group that matches the parent's group. 
*/ -static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext) +static int get_create_supp_group(struct mnt_idmap *idmap, + struct inode *dir, + struct fuse_in_arg *ext) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_ext_header *xh; struct fuse_supp_groups *sg; kgid_t kgid = dir->i_gid; + vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, kgid); gid_t parent_gid = from_kgid(fc->user_ns, kgid); + u32 sg_len = fuse_ext_size(sizeof(*sg) + sizeof(sg->groups[0])); - if (parent_gid == (gid_t) -1 || gid_eq(kgid, current_fsgid()) || - !in_group_p(kgid)) + if (parent_gid == (gid_t) -1 || vfsgid_eq_kgid(vfsgid, current_fsgid()) || + !vfsgid_in_group_p(vfsgid)) return 0; xh = extend_arg(ext, sg_len); @@ -572,7 +576,8 @@ static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext) return 0; } -static int get_create_ext(struct fuse_args *args, +static int get_create_ext(struct mnt_idmap *idmap, + struct fuse_args *args, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -583,7 +588,7 @@ static int get_create_ext(struct fuse_args *args, if (fc->init_security) err = get_security_context(dentry, mode, &ext); if (!err && fc->create_supp_group) - err = get_create_supp_group(dir, &ext); + err = get_create_supp_group(idmap, dir, &ext); if (!err && ext.size) { WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); @@ -668,7 +673,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[1].size = sizeof(*outopenp); args.out_args[1].value = outopenp; - err = get_create_ext(&args, dir, entry, mode); + err = get_create_ext(&nop_mnt_idmap, &args, dir, entry, mode); if (err) goto out_free_ff; @@ -798,7 +803,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].value = &outarg; if (args->opcode != FUSE_LINK) { - err = get_create_ext(args, dir, entry, mode); + err = get_create_ext(&nop_mnt_idmap, args, dir, entry, mode); if (err) goto out_put_forget_req; } From 
556208e139e19e0b308e104c6b0e42a9e0a54c3a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:16 +0200 Subject: [PATCH 15/32] fuse: support idmap for mkdir/mknod/symlink/create/tmpfile We have all the infrastructure in place, we just need to pass an idmapping here. Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8a936dc0072b..9da8feede181 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -614,9 +614,9 @@ static void free_ext_value(struct fuse_args *args) * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. */ -static int fuse_create_open(struct inode *dir, struct dentry *entry, - struct file *file, unsigned int flags, - umode_t mode, u32 opcode) +static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, struct file *file, + unsigned int flags, umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -673,11 +673,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[1].size = sizeof(*outopenp); args.out_args[1].value = outopenp; - err = get_create_ext(&nop_mnt_idmap, &args, dir, entry, mode); + err = get_create_ext(idmap, &args, dir, entry, mode); if (err) goto out_free_ff; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(idmap, fm, &args); free_ext_value(&args); if (err) goto out_free_ff; @@ -734,6 +734,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, umode_t mode) { int err; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; @@ -758,7 +759,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, 
mode, FUSE_CREATE); + err = fuse_create_open(idmap, dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -769,7 +770,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, return err; mknod: - err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); + err = fuse_mknod(idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -779,9 +780,9 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, - struct inode *dir, struct dentry *entry, - umode_t mode) +static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -803,12 +804,12 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].value = &outarg; if (args->opcode != FUSE_LINK) { - err = get_create_ext(&nop_mnt_idmap, args, dir, entry, mode); + err = get_create_ext(idmap, args, dir, entry, mode); if (err) goto out_put_forget_req; } - err = fuse_simple_request(NULL, fm, args); + err = fuse_simple_request(idmap, fm, args); free_ext_value(args); if (err) goto out_put_forget_req; @@ -869,13 +870,13 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fm, &args, dir, entry, mode); + return create_new_entry(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); + return fuse_mknod(idmap, dir, entry, mode, 0); } static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, @@ -887,7 +888,8 
@@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (fc->no_tmpfile) return -EOPNOTSUPP; - err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + err = fuse_create_open(idmap, dir, file->f_path.dentry, file, + file->f_flags, mode, FUSE_TMPFILE); if (err == -ENOSYS) { fc->no_tmpfile = 1; err = -EOPNOTSUPP; @@ -914,7 +916,7 @@ static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fm, &args, dir, entry, S_IFDIR); + return create_new_entry(idmap, fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, @@ -930,7 +932,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = entry->d_name.name; args.in_args[1].size = len; args.in_args[1].value = link; - return create_new_entry(fm, &args, dir, entry, S_IFLNK); + return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) @@ -1124,7 +1126,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); + err = create_new_entry(NULL, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) From 2a8c810d5e492906fe0af94a6e8b2398714e80e0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:17 +0200 Subject: [PATCH 16/32] fuse: support idmapped getattr inode op We have to: - pass an idmapping to the generic_fillattr() to properly handle UID/GID mapping for the userspace. - pass -/- to fuse_fillattr() (analog of generic_fillattr() in fuse).
Difference between these two is that generic_fillattr() takes all the stat() data from the inode directly, while fuse_fillattr() codepath takes a fresh data just from the userspace reply on the FUSE_GETATTR request. In some cases we can just pass &nop_mnt_idmap, because idmapping won't be used in these codepaths. For example, when 3rd argument of fuse_do_getattr() is NULL then idmap argument is not used. Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 9da8feede181..2bc3ae40619a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1135,18 +1135,22 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, return err; } -static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, - struct kstat *stat) +static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, + struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); + vfsuid_t vfsuid = make_vfsuid(idmap, fc->user_ns, + make_kuid(fc->user_ns, attr->uid)); + vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, + make_kgid(fc->user_ns, attr->gid)); stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = make_kuid(fc->user_ns, attr->uid); - stat->gid = make_kgid(fc->user_ns, attr->gid); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -1185,8 +1189,8 @@ static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) attr->blksize = sx->blksize; } -static int fuse_do_statx(struct inode *inode, struct file *file, - struct kstat *stat) +static int fuse_do_statx(struct mnt_idmap 
*idmap, struct inode *inode, + struct file *file, struct kstat *stat) { int err; struct fuse_attr attr; @@ -1239,15 +1243,15 @@ static int fuse_do_statx(struct inode *inode, struct file *file, stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); stat->btime.tv_sec = sx->btime.tv_sec; stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); - fuse_fillattr(inode, &attr, stat); + fuse_fillattr(idmap, inode, &attr, stat); stat->result_mask |= STATX_TYPE; } return 0; } -static int fuse_do_getattr(struct inode *inode, struct kstat *stat, - struct file *file) +static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, + struct kstat *stat, struct file *file) { int err; struct fuse_getattr_in inarg; @@ -1286,15 +1290,15 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, ATTR_TIMEOUT(&outarg), attr_version); if (stat) - fuse_fillattr(inode, &outarg.attr, stat); + fuse_fillattr(idmap, inode, &outarg.attr, stat); } } return err; } -static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat, u32 request_mask, - unsigned int flags) +static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, + struct file *file, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1325,17 +1329,17 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, forget_all_cached_acls(inode); /* Try statx if BTIME is requested */ if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { - err = fuse_do_statx(inode, file, stat); + err = fuse_do_statx(idmap, inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; err = 0; goto retry; } } else { - err = fuse_do_getattr(inode, stat, file); + err = fuse_do_getattr(idmap, inode, stat, file); } } else if (stat) { - generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + generic_fillattr(idmap, request_mask, inode, 
stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; if (test_bit(FUSE_I_BTIME, &fi->state)) { @@ -1349,7 +1353,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { - return fuse_update_get_attr(inode, file, NULL, mask, 0); + return fuse_update_get_attr(&nop_mnt_idmap, inode, file, NULL, mask, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, @@ -1493,7 +1497,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) return -ECHILD; forget_all_cached_acls(inode); - return fuse_do_getattr(inode, NULL, NULL); + return fuse_do_getattr(&nop_mnt_idmap, inode, NULL, NULL); } /* @@ -2072,7 +2076,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. */ - ret = fuse_do_getattr(inode, NULL, file); + ret = fuse_do_getattr(idmap, inode, NULL, file); if (ret) return ret; @@ -2129,7 +2133,7 @@ static int fuse_getattr(struct mnt_idmap *idmap, return -EACCES; } - return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); + return fuse_update_get_attr(idmap, inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { From c1d82215d391ded456e7bdd1c0893c9cb6322272 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:18 +0200 Subject: [PATCH 17/32] fuse: support idmapped ->permission inode op We only cover the case when "default_permissions" flag is used. A reason for that is that otherwise all the permission checks are done in the userspace and we have to deal with VFS idmapping in the userspace (which is bad), alternatively we have to provide the userspace with idmapped req->in.h.uid/req->in.h.gid which is also not align with VFS idmaps philosophy. 
Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2bc3ae40619a..70ed1c47d7ce 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1545,7 +1545,7 @@ static int fuse_permission(struct mnt_idmap *idmap, } if (fc->default_permissions) { - err = generic_permission(&nop_mnt_idmap, inode, mask); + err = generic_permission(idmap, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1553,7 +1553,7 @@ static int fuse_permission(struct mnt_idmap *idmap, if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(&nop_mnt_idmap, + err = generic_permission(idmap, inode, mask); } From 276a02569920f793047479fe7ef5da2c3e285a80 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:19 +0200 Subject: [PATCH 18/32] fuse: support idmapped ->setattr op Need to translate uid and gid in case of chown(2). 
Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 34 +++++++++++++++++++++++----------- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 4 ++-- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 70ed1c47d7ce..5fa9c99f4d70 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1749,17 +1749,29 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) return true; } -static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, - struct fuse_setattr_in *arg, bool trust_local_cmtime) +static void iattr_to_fattr(struct mnt_idmap *idmap, struct fuse_conn *fc, + struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; - if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); - if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); + + if (ivalid & ATTR_UID) { + kuid_t fsuid = from_vfsuid(idmap, fc->user_ns, iattr->ia_vfsuid); + + arg->valid |= FATTR_UID; + arg->uid = from_kuid(fc->user_ns, fsuid); + } + + if (ivalid & ATTR_GID) { + kgid_t fsgid = from_vfsgid(idmap, fc->user_ns, iattr->ia_vfsgid); + + arg->valid |= FATTR_GID; + arg->gid = from_kgid(fc->user_ns, fsgid); + } + if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { @@ -1879,8 +1891,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. 
*/ -int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - struct file *file) +int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); struct fuse_mount *fm = get_fuse_mount(inode); @@ -1900,7 +1912,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(&nop_mnt_idmap, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -1959,7 +1971,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); + iattr_to_fattr(idmap, fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -2094,7 +2106,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, if (!attr->ia_valid) return 0; - ret = fuse_do_setattr(entry, attr, file); + ret = fuse_do_setattr(idmap, entry, attr, file); if (!ret) { /* * If filesystem supports acls it may have updated acl xattrs in diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3c18e90fff07..0c33ddf97864 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2971,7 +2971,7 @@ static void fuse_do_truncate(struct file *file) attr.ia_file = file; attr.ia_valid |= ATTR_FILE; - fuse_do_setattr(file_dentry(file), &attr, file); + fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 407ee2542ff5..7dd0661aeab6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1324,8 +1324,8 @@ bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct 
writeback_control *wbc); -int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - struct file *file); +int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr, struct file *file); void fuse_set_initialized(struct fuse_conn *fc); From 4d833befa20253049e4f48ece9cb82f12800405d Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:20 +0200 Subject: [PATCH 19/32] fuse: drop idmap argument from __fuse_get_acl We don't need to have idmap in the __fuse_get_acl as we don't have any use for it. In the current POSIX ACL implementation, idmapped mounts are taken into account on the userspace/kernel border (see vfs_set_acl_idmapped_mnt() and vfs_posix_acl_to_xattr()). Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 04cfd8fee992..897d813c5e92 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -12,7 +12,6 @@ #include static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc, - struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu) { int size; @@ -74,7 +73,7 @@ struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, if (fuse_no_acl(fc, inode)) return ERR_PTR(-EOPNOTSUPP); - return __fuse_get_acl(fc, idmap, inode, type, false); + return __fuse_get_acl(fc, inode, type, false); } struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) @@ -90,8 +89,7 @@ struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) */ if (!fc->posix_acl) return NULL; - - return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu); + return __fuse_get_acl(fc, inode, type, rcu); } int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, From d395d0a5d2544b639cd7b0055e4de85b0efc2762 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:21 +0200 Subject: [PATCH 20/32] fuse: support 
idmapped ->set_acl It's just a matter of adjusting a permission check condition for S_ISGID flag. All the rest is already handled in the generic VFS code. Notice that this permission check is the analog of what we have in posix_acl_update_mode() generic helper, but fuse doesn't use this helper as on the kernel side we don't care about ensuring that POSIX ACL and CHMOD permissions are in sync as it is a responsibility of a userspace daemon to handle that. For the same reason we don't have calls to posix_acl_chmod(), while most other filesystems do. Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 897d813c5e92..8f484b105f13 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -144,8 +144,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * be stripped. */ if (fc->posix_acl && - !in_group_or_capable(&nop_mnt_idmap, inode, - i_gid_into_vfsgid(&nop_mnt_idmap, inode))) + !in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); From 4be75ffe721cbf925478ebfbff872b02833899b7 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:22 +0200 Subject: [PATCH 21/32] fuse: support idmapped ->rename op RENAME_WHITEOUT is a special case of ->rename and we need to take idmappings into account there.
Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5fa9c99f4d70..6f289fa9cc62 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1026,7 +1026,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, +static int fuse_rename_common(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags, int opcode, size_t argsize) { @@ -1047,7 +1047,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(idmap, fm, &args); if (!err) { /* ctime changes */ fuse_update_ctime(d_inode(oldent)); @@ -1093,7 +1093,8 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, if (fc->no_rename2 || fc->minor < 23) return -EINVAL; - err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + err = fuse_rename_common((flags & RENAME_WHITEOUT) ? idmap : NULL, + olddir, oldent, newdir, newent, flags, FUSE_RENAME2, sizeof(struct fuse_rename2_in)); if (err == -ENOSYS) { @@ -1101,7 +1102,7 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, err = -EINVAL; } } else { - err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + err = fuse_rename_common(NULL, olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); } From 5b8ca5a54cb89ab07b0389f50e038e533cdfdd86 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:23 +0200 Subject: [PATCH 22/32] fuse: handle idmappings properly in ->write_iter() This is needed to properly clear suid/sgid. 
Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0c33ddf97864..ca553d7a7c9e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1398,6 +1398,7 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct address_space *mapping = file->f_mapping; ssize_t written = 0; struct inode *inode = mapping->host; @@ -1412,7 +1413,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(&nop_mnt_idmap, + setattr_should_drop_suidgid(idmap, file_inode(file))) { goto writethrough; } From 6d14b18596ca69719cfe1af87dbf3c5e763d29b5 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:24 +0200 Subject: [PATCH 23/32] fuse: warn if fuse_access is called when idmapped mounts are allowed It is not possible with the current fuse code, but let's protect ourselves from regressions in the future. Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 6f289fa9cc62..99f9948bf68b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1474,6 +1474,14 @@ static int fuse_access(struct inode *inode, int mask) BUG_ON(mask & MAY_NOT_BLOCK); + /* + * We should not send FUSE_ACCESS to the userspace + * when idmapped mounts are enabled as for this case + * we have fc->default_permissions = 1 and access + * permission checks are done on the kernel side. 
+ */ + WARN_ON_ONCE(!(fm->sb->s_iflags & SB_I_NOIDMAP)); + if (fm->fc->no_access) return 0; From 16e1503eaf329129170e4e7a078aee17686967a5 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:25 +0200 Subject: [PATCH 24/32] fuse: allow idmapped mounts Now we have everything in place and we can allow idmapped mounts by setting the FS_ALLOW_IDMAP flag. Notice that real availability of idmapped mounts will depend on the fuse daemon. Fuse daemon have to set FUSE_ALLOW_IDMAP flag in the FUSE_INIT reply. To discuss: - we enable idmapped mounts support only if "default_permissions" mode is enabled, because otherwise we would need to deal with UID/GID mappings in the userspace side OR provide the userspace with idmapped req->in.h.uid/req->in.h.gid values which is not something that we probably want to. Idmapped mounts philosophy is not about faking caller uid/gid. Some extra links and examples: - libfuse support https://github.com/mihalicyn/libfuse/commits/idmap_support - fuse-overlayfs support: https://github.com/mihalicyn/fuse-overlayfs/commits/idmap_support - cephfs-fuse conversion example https://github.com/mihalicyn/ceph/commits/fuse_idmap - glusterfs conversion example https://github.com/mihalicyn/glusterfs/commits/fuse_idmap Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 12 +++++++++--- include/uapi/linux/fuse.h | 20 +++++++++++++++++++- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b08bc6081612..d7edb3fb829f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1348,6 +1348,12 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, } if (flags & FUSE_NO_EXPORT_SUPPORT) fm->sb->s_export_op = &fuse_export_fid_operations; + if (flags & FUSE_ALLOW_IDMAP) { + if (fc->default_permissions) + fm->sb->s_iflags &= ~SB_I_NOIDMAP; + else + ok = false; + } } else { ra_pages = fc->max_read / 
PAGE_SIZE; fc->no_lock = 1; @@ -1395,7 +1401,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND; + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1985,7 +1991,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_ALLOW_IDMAP, .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, @@ -2006,7 +2012,7 @@ static struct file_system_type fuseblk_fs_type = { .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, - .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("fuseblk"); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 2ccf38181df2..f1e99458e29e 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -217,6 +217,9 @@ * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag * - add FUSE_NO_EXPORT_SUPPORT init flag * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag + * + * 7.41 + * - add FUSE_ALLOW_IDMAP */ #ifndef _LINUX_FUSE_H @@ -252,7 +255,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 40 +#define FUSE_KERNEL_MINOR_VERSION 41 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -421,6 +424,7 @@ struct fuse_file_lock { * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit * of the request ID indicates 
resend requests + * FUSE_ALLOW_IDMAP: allow creation of idmapped mounts */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -466,6 +470,7 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP +#define FUSE_ALLOW_IDMAP (1ULL << 40) /** * CUSE INIT request/reply flags @@ -984,6 +989,19 @@ struct fuse_fallocate_in { */ #define FUSE_UNIQUE_RESEND (1ULL << 63) +/** + * This value will be set by the kernel to + * (struct fuse_in_header).{uid,gid} fields in + * case when: + * - fuse daemon enabled FUSE_ALLOW_IDMAP + * - idmapping information is not available and uid/gid + * can not be mapped in accordance with an idmapping. + * + * Note: an idmapping information always available + * for inode creation operations like: + * FUSE_MKNOD, FUSE_SYMLINK, FUSE_MKDIR, FUSE_TMPFILE, + * FUSE_CREATE and FUSE_RENAME2 (with RENAME_WHITEOUT). + */ #define FUSE_INVALID_UIDGID ((uint32_t)(-1)) struct fuse_in_header { From 862b9a8eb900d347af5be0eb1aeef9b161a83e77 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:26 +0200 Subject: [PATCH 25/32] virtio_fs: allow idmapped mounts Allow idmapped mounts for virtiofs. It's absolutely safe as for virtiofs we have the same feature negotiation mechanism as for classical fuse filesystems. This does not affect any existing setups anyhow. 
virtiofsd support: https://gitlab.com/virtio-fs/virtiofsd/-/merge_requests/245 Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Reviewed-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 2fee9eb5ad0b..b6bd6ab71f5d 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1613,6 +1613,7 @@ static struct file_system_type virtio_fs_type = { .name = "virtiofs", .init_fs_context = virtio_fs_init_fs_context, .kill_sb = virtio_kill_sb, + .fs_flags = FS_ALLOW_IDMAP, }; static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) From efad7153bf93db8565128f7567aab1d23e221098 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 13 Sep 2024 12:47:01 +0200 Subject: [PATCH 26/32] fuse: allow O_PATH fd for FUSE_DEV_IOC_BACKING_OPEN Only f_path is used from backing files registered with FUSE_DEV_IOC_BACKING_OPEN, so it makes sense to allow O_PATH descriptors. O_PATH files have an empty f_op, so don't check read_iter/write_iter. 
Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/passthrough.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 9666d13884ce..62aee8289d11 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -228,16 +228,13 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) if (map->flags || map->padding) goto out; - file = fget(map->fd); + file = fget_raw(map->fd); res = -EBADF; if (!file) goto out; - res = -EOPNOTSUPP; - if (!file->f_op->read_iter || !file->f_op->write_iter) - goto out_fput; - backing_sb = file_inode(file)->i_sb; + pr_info("%s: %x:%pD %i\n", __func__, backing_sb->s_dev, file, backing_sb->s_stack_depth); res = -ELOOP; if (backing_sb->s_stack_depth >= fc->max_stack_depth) goto out_fput; From 3988a60d3aaabd6cca64fbd8f7be65c0c878d87b Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Thu, 12 Sep 2024 16:58:24 +0200 Subject: [PATCH 27/32] fs/fuse: fix null-ptr-deref when checking SB_I_NOIDMAP flag It was reported [1] that on linux-next/fs-next the following crash is reproducible: [ 42.659136] Oops: general protection fault, probably for non-canonical address 0xdffffc000000000b: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 42.660501] fbcon: Taking over console [ 42.660930] KASAN: null-ptr-deref in range [0x0000000000000058-0x000000000000005f] [ 42.661752] CPU: 1 UID: 0 PID: 1589 Comm: dtprobed Not tainted 6.11.0-rc6+ #1 [ 42.662565] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.6.6 08/22/2023 [ 42.663472] RIP: 0010:fuse_get_req+0x36b/0x990 [fuse] [ 42.664046] Code: 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 8c 05 00 00 48 b8 00 00 00 00 00 fc ff df 48 8b 6d 08 48 8d 7d 58 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 4d 05 00 00 f6 45 59 20 0f 85 06 03 00 00 48 83 [ 42.666945] RSP: 0018:ffffc900009a7730 EFLAGS: 00010212 [ 42.668837] RAX: dffffc0000000000 RBX: 1ffff92000134eed RCX: ffffffffc20dec9a [ 42.670122] RDX: 
000000000000000b RSI: 0000000000000008 RDI: 0000000000000058 [ 42.672154] RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed1022110172 [ 42.672160] R10: ffff888110880b97 R11: ffffc900009a737a R12: 0000000000000001 [ 42.672179] R13: ffff888110880b60 R14: ffff888110880b90 R15: ffff888169973840 [ 42.672186] FS: 00007f28cd21d7c0(0000) GS:ffff8883ef280000(0000) knlGS:0000000000000000 [ 42.672191] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 42.[ CR02: ;32m00007f3237366208 CR3: 0 OK 79e001 CR4: 0000000000770ef0 [ 42.672214] PKRU: 55555554 [ 42.672218] Call Trace: [ 42.672223] [ 42.672226] ? die_addr+0x41/0xa0 [ 42.672238] ? exc_general_protection+0x14c/0x230 [ 42.672250] ? asm_exc_general_protection+0x26/0x30 [ 42.672260] ? fuse_get_req+0x77a/0x990 [fuse] [ 42.672281] ? fuse_get_req+0x36b/0x990 [fuse] [ 42.672300] ? kasan_unpoison+0x27/0x60 [ 42.672310] ? __pfx_fuse_get_req+0x10/0x10 [fuse] [ 42.672327] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.672333] ? alloc_pages_mpol_noprof+0x195/0x440 [ 42.672340] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.672345] ? kasan_unpoison+0x27/0x60 [ 42.672350] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.672355] ? __kasan_slab_alloc+0x4d/0x90 [ 42.672362] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.672367] ? __kmalloc_cache_noprof+0x134/0x350 [ 42.672376] fuse_simple_background+0xe7/0x180 [fuse] [ 42.672406] cuse_channel_open+0x540/0x710 [cuse] [ 42.672415] misc_open+0x2a7/0x3a0 [ 42.672424] chrdev_open+0x1ef/0x5f0 [ 42.672432] ? __pfx_chrdev_open+0x10/0x10 [ 42.672439] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.672443] ? security_file_open+0x3bb/0x720 [ 42.672451] do_dentry_open+0x43d/0x1200 [ 42.672459] ? __pfx_chrdev_open+0x10/0x10 [ 42.672468] vfs_open+0x79/0x340 [ 42.672475] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.672482] do_open+0x68c/0x11e0 [ 42.672489] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.672495] ? __pfx_do_open+0x10/0x10 [ 42.672501] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.672506] ? 
open_last_lookups+0x2a2/0x1370 [ 42.672515] path_openat+0x24f/0x640 [ 42.672522] ? __pfx_path_openat+0x10/0x10 [ 42.723972] ? stack_depot_save_flags+0x45/0x4b0 [ 42.724787] ? __fput+0x43c/0xa70 [ 42.725100] do_filp_open+0x1b3/0x3e0 [ 42.725710] ? poison_slab_object+0x10d/0x190 [ 42.726145] ? __kasan_slab_free+0x33/0x50 [ 42.726570] ? __pfx_do_filp_open+0x10/0x10 [ 42.726981] ? do_syscall_64+0x64/0x170 [ 42.727418] ? entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 42.728018] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.728505] ? do_raw_spin_lock+0x131/0x270 [ 42.728922] ? __pfx_do_raw_spin_lock+0x10/0x10 [ 42.729494] ? do_raw_spin_unlock+0x14c/0x1f0 [ 42.729992] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.730889] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.732178] ? alloc_fd+0x176/0x5e0 [ 42.732585] do_sys_openat2+0x122/0x160 [ 42.732929] ? __pfx_do_sys_openat2+0x10/0x10 [ 42.733448] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.734013] ? __pfx_map_id_up+0x10/0x10 [ 42.734482] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.735529] ? __memcg_slab_free_hook+0x292/0x500 [ 42.736131] __x64_sys_openat+0x123/0x1e0 [ 42.736526] ? __pfx___x64_sys_openat+0x10/0x10 [ 42.737369] ? __x64_sys_close+0x7c/0xd0 [ 42.737717] ? srso_alias_return_thunk+0x5/0xfbef5 [ 42.738192] ? 
syscall_trace_enter+0x11e/0x1b0 [ 42.738739] do_syscall_64+0x64/0x170 [ 42.739113] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 42.739638] RIP: 0033:0x7f28cd13e87b [ 42.740038] Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 54 24 28 64 48 2b 14 25 [ 42.741943] RSP: 002b:00007ffc992546c0 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 [ 42.742951] RAX: ffffffffffffffda RBX: 00007f28cd44f1ee RCX: 00007f28cd13e87b [ 42.743660] RDX: 0000000000000002 RSI: 00007f28cd44f2fa RDI: 00000000ffffff9c [ 42.744518] RBP: 00007f28cd44f2fa R08: 0000000000000000 R09: 0000000000000001 [ 42.745211] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 [ 42.745920] R13: 00007f28cd44f2fa R14: 0000000000000000 R15: 0000000000000003 [ 42.746708] [ 42.746937] Modules linked in: cuse vfat fat ext4 mbcache jbd2 intel_rapl_msr intel_rapl_common kvm_amd ccp bochs drm_vram_helper kvm drm_ttm_helper ttm pcspkr i2c_piix4 drm_kms_helper i2c_smbus pvpanic_mmio pvpanic joydev sch_fq_codel drm fuse xfs nvme_tcp nvme_fabrics nvme_core sd_mod sg virtio_net net_failover virtio_scsi failover crct10dif_pclmul crc32_pclmul ata_generic pata_acpi ata_piix ghash_clmulni_intel virtio_pci sha512_ssse3 virtio_pci_legacy_dev sha256_ssse3 virtio_pci_modern_dev sha1_ssse3 libata serio_raw dm_multipath btrfs blake2b_generic xor zstd_compress raid6_pq sunrpc dm_mirror dm_region_hash dm_log dm_mod be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls cxgb3i cxgb3 mdio libcxgbi libcxgb qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi qemu_fw_cfg aesni_intel crypto_simd cryptd [ 42.754333] ---[ end trace 0000000000000000 ]--- [ 42.756899] RIP: 0010:fuse_get_req+0x36b/0x990 [fuse] [ 42.757851] Code: 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 8c 05 00 00 48 b8 00 00 00 00 00 fc ff df 48 8b 6d 08 48 8d 7d 58 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 4d 05 00 00 f6 45 59 20 0f 85 
06 03 00 00 48 83 [ 42.760334] RSP: 0018:ffffc900009a7730 EFLAGS: 00010212 [ 42.760940] RAX: dffffc0000000000 RBX: 1ffff92000134eed RCX: ffffffffc20dec9a [ 42.761697] RDX: 000000000000000b RSI: 0000000000000008 RDI: 0000000000000058 [ 42.763009] RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed1022110172 [ 42.763920] R10: ffff888110880b97 R11: ffffc900009a737a R12: 0000000000000001 [ 42.764839] R13: ffff888110880b60 R14: ffff888110880b90 R15: ffff888169973840 [ 42.765716] FS: 00007f28cd21d7c0(0000) GS:ffff8883ef280000(0000) knlGS:0000000000000000 [ 42.766890] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 42.767828] CR2: 00007f3237366208 CR3: 000000012c79e001 CR4: 0000000000770ef0 [ 42.768730] PKRU: 55555554 [ 42.769022] Kernel panic - not syncing: Fatal exception [ 42.770758] Kernel Offset: 0x7200000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 42.771947] ---[ end Kernel panic - not syncing: Fatal exception ]--- This is obviously a CUSE-related call stack. In the CUSE case, we don't have a superblock, and our checks for the SB_I_NOIDMAP flag do not make any sense. Let's handle this case gracefully. 
Fixes: aa16880d9f13 ("fuse: add basic infrastructure to support idmappings") Link: https://lore.kernel.org/linux-next/87v7z586py.fsf@debian-BULLSEYE-live-builder-AMD64/ [1] Reported-by: Chandan Babu R Reported-by: syzbot+20c7e20cc8f5296dca12@syzkaller.appspotmail.com Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5ca2ab48527e..3e7912c65bc6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -148,7 +148,7 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, if (for_background) __set_bit(FR_BACKGROUND, &req->flags); - if ((fm->sb->s_iflags & SB_I_NOIDMAP) || idmap) { + if (!fm->sb || (fm->sb->s_iflags & SB_I_NOIDMAP) || idmap) { kuid_t idmapped_fsuid; kgid_t idmapped_fsgid; @@ -517,7 +517,7 @@ static void fuse_force_creds(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; - if (req->fm->sb->s_iflags & SB_I_NOIDMAP) { + if (!req->fm->sb || req->fm->sb->s_iflags & SB_I_NOIDMAP) { req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); } else { From 0c6793823d2b0eb079f4c6f54d9cdf6b2beec9d8 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 6 Sep 2024 16:34:51 +0200 Subject: [PATCH 28/32] fs/fuse: introduce and use fuse_simple_idmap_request() helper Let's convert all existing callers properly. No functional changes intended. 
Suggested-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 4 ++-- fs/fuse/dev.c | 6 +++--- fs/fuse/dir.c | 26 +++++++++++++------------- fs/fuse/file.c | 32 ++++++++++++++++---------------- fs/fuse/fuse_i.h | 18 ++++++++++++++++-- fs/fuse/inode.c | 6 +++--- fs/fuse/ioctl.c | 2 +- fs/fuse/readdir.c | 4 ++-- fs/fuse/xattr.c | 8 ++++---- 9 files changed, 60 insertions(+), 46 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 6d8368d66dd4..12ef91d170bb 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -207,7 +207,7 @@ static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err < 0) return err; dmap->writable = writable; @@ -245,7 +245,7 @@ static int fuse_send_removemapping(struct inode *inode, args.in_args[0].value = inargp; args.in_args[1].size = inargp->count * sizeof(*remove_one); args.in_args[1].value = remove_one; - return fuse_simple_request(NULL, fm, &args); + return fuse_simple_request(fm, &args); } static int dmap_removemapping_list(struct inode *inode, unsigned int num, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3e7912c65bc6..317d2b30b21f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -539,9 +539,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } -ssize_t fuse_simple_request(struct mnt_idmap *idmap, - struct fuse_mount *fm, - struct fuse_args *args) +ssize_t __fuse_simple_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 99f9948bf68b..491e112819be 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -230,7 +230,7 @@ static int 
fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) parent = dget_parent(entry); fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); - ret = fuse_simple_request(NULL, fm, &args); + ret = fuse_simple_request(fm, &args); dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) @@ -383,7 +383,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name attr_version = fuse_get_attr_version(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; @@ -677,7 +677,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, if (err) goto out_free_ff; - err = fuse_simple_request(idmap, fm, &args); + err = fuse_simple_idmap_request(idmap, fm, &args); free_ext_value(&args); if (err) goto out_free_ff; @@ -809,7 +809,7 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, goto out_put_forget_req; } - err = fuse_simple_request(idmap, fm, args); + err = fuse_simple_idmap_request(idmap, fm, args); free_ext_value(args); if (err) goto out_put_forget_req; @@ -994,7 +994,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); @@ -1017,7 +1017,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); @@ -1047,7 +1047,7 
@@ static int fuse_rename_common(struct mnt_idmap *idmap, struct inode *olddir, str args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(idmap, fm, &args); + err = fuse_simple_idmap_request(idmap, fm, &args); if (!err) { /* ctime changes */ fuse_update_ctime(d_inode(oldent)); @@ -1222,7 +1222,7 @@ static int fuse_do_statx(struct mnt_idmap *idmap, struct inode *inode, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err) return err; @@ -1280,7 +1280,7 @@ static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || inode_wrong_type(inode, outarg.attr.mode)) { @@ -1492,7 +1492,7 @@ static int fuse_access(struct inode *inode, int mask) args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_access = 1; err = 0; @@ -1604,7 +1604,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) ap.args.page_zeroing = true; ap.args.out_numargs = 1; ap.args.out_args[0].size = desc.length; - res = fuse_simple_request(NULL, fm, &ap.args); + res = fuse_simple_request(fm, &ap.args); fuse_invalidate_atime(inode); @@ -1889,7 +1889,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) } fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); - return fuse_simple_request(NULL, fm, &args); + return fuse_simple_request(fm, &args); } /* @@ -2002,7 +2002,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry 
*dentry, inarg.valid |= FATTR_KILL_SUIDGID; } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ca553d7a7c9e..b8afeca12487 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -48,7 +48,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, args.out_args[0].size = sizeof(*outargp); args.out_args[0].value = outargp; - return fuse_simple_request(NULL, fm, &args); + return fuse_simple_request(fm, &args); } struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) @@ -111,7 +111,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) if (!args) { /* Do nothing when server does not implement 'open' */ } else if (sync) { - fuse_simple_request(NULL, ff->fm, args); + fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); } else { args->end = fuse_release_end; @@ -539,7 +539,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) args.in_args[0].value = &inarg; args.force = true; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_flush = 1; err = 0; @@ -572,7 +572,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - return fuse_simple_request(NULL, fm, &args); + return fuse_simple_request(fm, &args); } static int fuse_fsync(struct file *file, loff_t start, loff_t end, @@ -814,7 +814,7 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, if (ia->io->async) return fuse_async_req_send(fm, ia, count); - return fuse_simple_request(NULL, fm, &ia->ap.args); + return fuse_simple_request(fm, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -878,7 +878,7 @@ static int fuse_do_readpage(struct 
file *file, struct page *page) desc.length--; fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); - res = fuse_simple_request(NULL, fm, &ia.ap.args); + res = fuse_simple_request(fm, &ia.ap.args); if (res < 0) return res; /* @@ -976,7 +976,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) if (!err) return; } else { - res = fuse_simple_request(NULL, fm, &ap->args); + res = fuse_simple_request(fm, &ap->args); err = res < 0 ? res : 0; } fuse_readpages_end(fm, &ap->args, err); @@ -1101,7 +1101,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, if (ia->io->async) return fuse_async_req_send(fm, ia, count); - err = fuse_simple_request(NULL, fm, &ia->ap.args); + err = fuse_simple_request(fm, &ia->ap.args); if (!err && ia->write.out.size > count) err = -EIO; @@ -1147,7 +1147,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID)) ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; - err = fuse_simple_request(NULL, fm, &ap->args); + err = fuse_simple_request(fm, &ap->args); if (!err && ia->write.out.size > count) err = -EIO; @@ -2662,7 +2662,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (!err) err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); @@ -2686,7 +2686,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); /* locking is restartable */ if (err == -EINTR) @@ -2760,7 +2760,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = 
fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) fm->fc->no_bmap = 1; @@ -2792,7 +2792,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -ENOSYS) { fm->fc->no_lseek = 1; @@ -2925,7 +2925,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (!err) return demangle_poll(outarg.revents); @@ -3147,7 +3147,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_fallocate = 1; err = -EOPNOTSUPP; @@ -3259,7 +3259,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; err = -EOPNOTSUPP; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7dd0661aeab6..b2c7834f21b5 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1147,8 +1147,22 @@ void __exit fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing */ -ssize_t fuse_simple_request(struct mnt_idmap *idmap, struct fuse_mount *fm, - struct fuse_args *args); +ssize_t __fuse_simple_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args); + +static inline ssize_t fuse_simple_request(struct fuse_mount *fm, 
struct fuse_args *args) +{ + return __fuse_simple_request(NULL, fm, args); +} + +static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args) +{ + return __fuse_simple_request(idmap, fm, args); +} + int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d7edb3fb829f..fd3321e29a3e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -586,7 +586,7 @@ static void fuse_send_destroy(struct fuse_mount *fm) args.opcode = FUSE_DESTROY; args.force = true; args.nocreds = true; - fuse_simple_request(NULL, fm, &args); + fuse_simple_request(fm, &args); } } @@ -624,7 +624,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); return err; @@ -713,7 +713,7 @@ static int fuse_sync_fs(struct super_block *sb, int wait) args.nodeid = get_node_id(sb->s_root->d_inode); args.out_numargs = 0; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fc->sync_fs = 0; err = 0; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index b40dd931167d..572ce8a82ceb 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -18,7 +18,7 @@ static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].size = sizeof(*outarg); args->out_args[0].value = outarg; - ret = fuse_simple_request(NULL, fm, args); + ret = fuse_simple_request(fm, args); /* Translate ENOSYS, which shouldn't be returned from fs */ if (ret == -ENOSYS) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index e8a093289421..0377b6dc24c8 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -279,7 +279,7 @@ static void fuse_force_forget(struct file *file, u64 
nodeid) args.force = true; args.noreply = true; - fuse_simple_request(NULL, fm, &args); + fuse_simple_request(fm, &args); /* ignore errors */ } @@ -358,7 +358,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(NULL, fm, &ap->args); + res = fuse_simple_request(fm, &ap->args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 0a9b60de3668..9f568d345c51 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -37,7 +37,7 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, args.in_args[1].value = name; args.in_args[2].size = size; args.in_args[2].value = value; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; @@ -79,7 +79,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(NULL, fm, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(size_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { @@ -141,7 +141,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(NULL, fm, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(size_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) @@ -167,7 +167,7 @@ int fuse_removexattr(struct inode *inode, const char *name) args.in_numargs = 1; args.in_args[0].size = strlen(name) + 1; args.in_args[0].value = name; - err = fuse_simple_request(NULL, fm, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; From ffcdc4c628e1a30489da10dd78358e89c823b341 Mon Sep 17 00:00:00 2001 
From: Alexander Mikhalitsyn Date: Fri, 6 Sep 2024 16:34:52 +0200 Subject: [PATCH 29/32] fs/mnt_idmapping: introduce an invalid_mnt_idmap Link: https://lore.kernel.org/linux-fsdevel/20240904-baugrube-erhoben-b3c1c49a2645@brauner/ Suggested-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/mnt_idmapping.c | 22 ++++++++++++++++++++-- include/linux/mnt_idmapping.h | 1 + 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 3c60f1eaca61..cbca6500848e 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -32,6 +32,15 @@ struct mnt_idmap nop_mnt_idmap = { }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); +/* + * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range. + * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID. + */ +struct mnt_idmap invalid_mnt_idmap = { + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(invalid_mnt_idmap); + /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check @@ -75,6 +84,8 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); + if (idmap == &invalid_mnt_idmap) + return INVALID_VFSUID; if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); else @@ -112,6 +123,8 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); + if (idmap == &invalid_mnt_idmap) + return INVALID_VFSGID; if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); else @@ -140,6 +153,8 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap, if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); + if (idmap == &invalid_mnt_idmap) + return INVALID_UID; uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; @@ -167,6 +182,8 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap, if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); + if (idmap == &invalid_mnt_idmap) 
+ return INVALID_GID; gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; @@ -296,7 +313,7 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) */ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) { - if (idmap != &nop_mnt_idmap) + if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap) refcount_inc(&idmap->count); return idmap; @@ -312,7 +329,8 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get); */ void mnt_idmap_put(struct mnt_idmap *idmap) { - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) + if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap && + refcount_dec_and_test(&idmap->count)) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index cd4d5c8781f5..b1b219bc3422 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -9,6 +9,7 @@ struct mnt_idmap; struct user_namespace; extern struct mnt_idmap nop_mnt_idmap; +extern struct mnt_idmap invalid_mnt_idmap; extern struct user_namespace init_user_ns; typedef struct { From 106e4593ed1b9925ca732a74f490e4f52ea4e65c Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 6 Sep 2024 16:34:53 +0200 Subject: [PATCH 30/32] fs/fuse: convert to use invalid_mnt_idmap We should convert the fs/fuse code to use the newly introduced invalid_mnt_idmap instead of passing NULL as the idmap pointer. 
Suggested-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 47 +++++++++++++++++++++++------------------------ fs/fuse/dir.c | 6 +++--- fs/fuse/fuse_i.h | 2 +- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 317d2b30b21f..cdf925e0b317 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -114,7 +114,11 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, { struct fuse_conn *fc = fm->fc; struct fuse_req *req; + bool no_idmap = !fm->sb || (fm->sb->s_iflags & SB_I_NOIDMAP); + kuid_t fsuid; + kgid_t fsgid; int err; + atomic_inc(&fc->num_waiting); if (fuse_block_alloc(fc, for_background)) { @@ -148,29 +152,24 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, if (for_background) __set_bit(FR_BACKGROUND, &req->flags); - if (!fm->sb || (fm->sb->s_iflags & SB_I_NOIDMAP) || idmap) { - kuid_t idmapped_fsuid; - kgid_t idmapped_fsgid; + /* + * Keep the old behavior when idmappings support was not + * declared by a FUSE server. + * + * For those FUSE servers who support idmapped mounts, + * we send UID/GID only along with "inode creation" + * fuse requests, otherwise idmap == &invalid_mnt_idmap and + * req->in.h.{u,g}id will be equal to FUSE_INVALID_UIDGID. + */ + fsuid = no_idmap ? current_fsuid() : mapped_fsuid(idmap, fc->user_ns); + fsgid = no_idmap ? current_fsgid() : mapped_fsgid(idmap, fc->user_ns); + req->in.h.uid = from_kuid(fc->user_ns, fsuid); + req->in.h.gid = from_kgid(fc->user_ns, fsgid); - /* - * Note, that when - * (fm->sb->s_iflags & SB_I_NOIDMAP) is true, then - * (idmap == &nop_mnt_idmap) is always true and therefore, - * mapped_fsuid(idmap, fc->user_ns) == current_fsuid(). - */ - idmapped_fsuid = idmap ? mapped_fsuid(idmap, fc->user_ns) : current_fsuid(); - idmapped_fsgid = idmap ? 
mapped_fsgid(idmap, fc->user_ns) : current_fsgid(); - req->in.h.uid = from_kuid(fc->user_ns, idmapped_fsuid); - req->in.h.gid = from_kgid(fc->user_ns, idmapped_fsgid); - - if (unlikely(req->in.h.uid == ((uid_t)-1) || - req->in.h.gid == ((gid_t)-1))) { - fuse_put_request(req); - return ERR_PTR(-EOVERFLOW); - } - } else { - req->in.h.uid = FUSE_INVALID_UIDGID; - req->in.h.gid = FUSE_INVALID_UIDGID; + if (no_idmap && unlikely(req->in.h.uid == ((uid_t)-1) || + req->in.h.gid == ((gid_t)-1))) { + fuse_put_request(req); + return ERR_PTR(-EOVERFLOW); } return req; @@ -619,7 +618,7 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(NULL, fm, true); + req = fuse_get_req(&invalid_mnt_idmap, fm, true); if (IS_ERR(req)) return PTR_ERR(req); } @@ -641,7 +640,7 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_req *req; struct fuse_iqueue *fiq = &fm->fc->iq; - req = fuse_get_req(NULL, fm, false); + req = fuse_get_req(&invalid_mnt_idmap, fm, false); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 491e112819be..54104dd48af7 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1093,7 +1093,7 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, if (fc->no_rename2 || fc->minor < 23) return -EINVAL; - err = fuse_rename_common((flags & RENAME_WHITEOUT) ? idmap : NULL, + err = fuse_rename_common((flags & RENAME_WHITEOUT) ? 
idmap : &invalid_mnt_idmap, olddir, oldent, newdir, newent, flags, FUSE_RENAME2, sizeof(struct fuse_rename2_in)); @@ -1102,7 +1102,7 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, err = -EINVAL; } } else { - err = fuse_rename_common(NULL, olddir, oldent, newdir, newent, 0, + err = fuse_rename_common(&invalid_mnt_idmap, olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); } @@ -1127,7 +1127,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(NULL, fm, &args, newdir, newent, inode->i_mode); + err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b2c7834f21b5..e6cc3d552b13 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1153,7 +1153,7 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap, static inline ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) { - return __fuse_simple_request(NULL, fm, args); + return __fuse_simple_request(&invalid_mnt_idmap, fm, args); } static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, From fcd2d9e1fdcd7cada612f2e8737fb13a2bce7d0e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Sep 2024 10:47:23 +0200 Subject: [PATCH 31/32] fuse: clear FR_PENDING if abort is detected when sending request The (!fiq->connected) check was moved into the queuing method resulting in the following: Fixes: 5de8acb41c86 ("fuse: cleanup request queuing towards virtiofs") Reported-by: Lai, Yi Closes: https://lore.kernel.org/all/ZvFEAM6JfrBKsOU0@ly-workstation/ Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cdf925e0b317..53c4569d85a4 100644 
--- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -295,6 +295,7 @@ static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) } else { spin_unlock(&fiq->lock); req->out.h.error = -ENOTCONN; + clear_bit(FR_PENDING, &req->flags); fuse_request_end(req); } } From 2f3d8ff457982f4055fe8f7bf19d3821ba22c376 Mon Sep 17 00:00:00 2001 From: yangyun Date: Sat, 14 Sep 2024 16:51:31 +0800 Subject: [PATCH 32/32] fuse: use exclusive lock when FUSE_I_CACHE_IO_MODE is set The `return false` here may be a typo: the comment says shared locks are not allowed when this bit is set. If a shared lock is used, the wait in `fuse_file_cached_io_open` may last forever. Fixes: 205c1d802683 ("fuse: allow parallel dio writes with FUSE_DIRECT_IO_ALLOW_MMAP") CC: stable@vger.kernel.org # v6.9 Signed-off-by: yangyun Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b8afeca12487..1b5cd46c8225 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1345,7 +1345,7 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from /* shared locks are not allowed with parallel page cache IO */ if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) - return false; + return true; /* Parallel dio beyond EOF is not supported, at least for now. */ if (fuse_io_past_eof(iocb, from))