linux/fs/netfs/read_collect.c
David Howells 38cf8e9457
netfs: Fix ceph copy to cache on write-begin
At the end of netfs_unlock_read_folio() in which folios are marked
appropriately for copying to the cache (either with by being marked dirty
and having their private data set or by having PG_private_2 set) and then
unlocked, the folio_queue struct has the entry pointing to the folio
cleared.  This presents a problem for netfs_pgpriv2_write_to_the_cache(),
which is used to write folios marked with PG_private_2 to the cache as it
expects to be able to trawl the folio_queue list thereafter to find the
relevant folios, leading to a hang.

Fix this by not clearing the folio_queue entry if we're going to do the
deprecated copy-to-cache.  The clearance will be done instead as the folios
are written to the cache.

This can be reproduced by starting cachefiles, mounting a ceph filesystem
with "-o fsc" and writing to it.

Fixes: 796a4049640b ("netfs: In readahead, put the folio refs as soon extracted")
Reported-by: Max Kellermann <max.kellermann@ionos.com>
Closes: https://lore.kernel.org/r/CAKPOu+_4m80thNy5_fvROoxBm689YtA0dZ-=gcmkzwYSY4syqw@mail.gmail.com/
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20241213135013.2964079-10-dhowells@redhat.com
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
cc: Jeff Layton <jlayton@kernel.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: netfs@lists.linux.dev
cc: ceph-devel@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-12-20 22:07:57 +01:00

552 lines
17 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem read subrequest result collection, assessment and
* retrying.
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
/*
* Clear the unread part of an I/O request.
*/
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
netfs_reset_iter(subreq);
WARN_ON_ONCE(subreq->len - subreq->transferred != iov_iter_count(&subreq->io_iter));
iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
if (subreq->start + subreq->transferred >= subreq->rreq->i_size)
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
}
/*
* Flush, mark and unlock a folio that's now completely read. If we want to
* cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
* dirty and let writeback handle it.
*/
static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq,
struct folio_queue *folioq,
int slot)
{
struct netfs_folio *finfo;
struct folio *folio = folioq_folio(folioq, slot);
flush_dcache_folio(folio);
folio_mark_uptodate(folio);
if (!test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
finfo = netfs_folio_info(folio);
if (finfo) {
trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
if (finfo->netfs_group)
folio_change_private(folio, finfo->netfs_group);
else
folio_detach_private(folio);
kfree(finfo);
}
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
folio_mark_dirty(folio);
}
} else {
trace_netfs_folio(folio, netfs_folio_trace_read_done);
}
folioq_clear(folioq, slot);
} else {
// TODO: Use of PG_private_2 is deprecated.
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot);
else
folioq_clear(folioq, slot);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
_debug("no unlock");
} else {
trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
folio_unlock(folio);
}
}
}
/*
* Unlock any folios that are now completely read. Returns true if the
* subrequest is removed from the list.
*/
static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async)
{
struct netfs_io_subrequest *prev, *next;
struct netfs_io_request *rreq = subreq->rreq;
struct folio_queue *folioq = subreq->curr_folioq;
size_t avail, prev_donated, next_donated, fsize, part, excess;
loff_t fpos, start;
loff_t fend;
int slot = subreq->curr_folioq_slot;
if (WARN(subreq->transferred > subreq->len,
"Subreq overread: R%x[%x] %zu > %zu",
rreq->debug_id, subreq->debug_index,
subreq->transferred, subreq->len))
subreq->transferred = subreq->len;
next_folio:
fsize = PAGE_SIZE << subreq->curr_folio_order;
fpos = round_down(subreq->start + subreq->consumed, fsize);
fend = fpos + fsize;
if (WARN_ON_ONCE(!folioq) ||
WARN_ON_ONCE(!folioq_folio(folioq, slot)) ||
WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) {
pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n",
rreq->debug_id, subreq->debug_index,
subreq->start, subreq->start + subreq->transferred - 1,
subreq->consumed, subreq->transferred, subreq->len,
slot);
if (folioq) {
struct folio *folio = folioq_folio(folioq, slot);
pr_err("folioq: orders=%02x%02x%02x%02x\n",
folioq->orders[0], folioq->orders[1],
folioq->orders[2], folioq->orders[3]);
if (folio)
pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
fpos, fend - 1, folio_pos(folio), folio_order(folio),
folioq_folio_order(folioq, slot));
}
}
donation_changed:
/* Try to consume the current folio if we've hit or passed the end of
* it. There's a possibility that this subreq doesn't start at the
* beginning of the folio, in which case we need to donate to/from the
* preceding subreq.
*
* We also need to include any potential donation back from the
* following subreq.
*/
prev_donated = READ_ONCE(subreq->prev_donated);
next_donated = READ_ONCE(subreq->next_donated);
if (prev_donated || next_donated) {
spin_lock_bh(&rreq->lock);
prev_donated = subreq->prev_donated;
next_donated = subreq->next_donated;
subreq->start -= prev_donated;
subreq->len += prev_donated;
subreq->transferred += prev_donated;
prev_donated = subreq->prev_donated = 0;
if (subreq->transferred == subreq->len) {
subreq->len += next_donated;
subreq->transferred += next_donated;
next_donated = subreq->next_donated = 0;
}
trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
spin_unlock_bh(&rreq->lock);
}
avail = subreq->transferred;
if (avail == subreq->len)
avail += next_donated;
start = subreq->start;
if (subreq->consumed == 0) {
start -= prev_donated;
avail += prev_donated;
} else {
start += subreq->consumed;
avail -= subreq->consumed;
}
part = umin(avail, fsize);
trace_netfs_progress(subreq, start, avail, part);
if (start + avail >= fend) {
if (fpos == start) {
/* Flush, unlock and mark for caching any folio we've just read. */
subreq->consumed = fend - subreq->start;
netfs_unlock_read_folio(subreq, rreq, folioq, slot);
folioq_mark2(folioq, slot);
if (subreq->consumed >= subreq->len)
goto remove_subreq;
} else if (fpos < start) {
excess = fend - subreq->start;
spin_lock_bh(&rreq->lock);
/* If we complete first on a folio split with the
* preceding subreq, donate to that subreq - otherwise
* we get the responsibility.
*/
if (subreq->prev_donated != prev_donated) {
spin_unlock_bh(&rreq->lock);
goto donation_changed;
}
if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) {
spin_unlock_bh(&rreq->lock);
pr_err("Can't donate prior to front\n");
goto bad;
}
prev = list_prev_entry(subreq, rreq_link);
WRITE_ONCE(prev->next_donated, prev->next_donated + excess);
subreq->start += excess;
subreq->len -= excess;
subreq->transferred -= excess;
trace_netfs_donate(rreq, subreq, prev, excess,
netfs_trace_donate_tail_to_prev);
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
if (subreq->consumed >= subreq->len)
goto remove_subreq_locked;
spin_unlock_bh(&rreq->lock);
} else {
pr_err("fpos > start\n");
goto bad;
}
/* Advance the rolling buffer to the next folio. */
slot++;
if (slot >= folioq_nr_slots(folioq)) {
slot = 0;
folioq = folioq->next;
subreq->curr_folioq = folioq;
}
subreq->curr_folioq_slot = slot;
if (folioq && folioq_folio(folioq, slot))
subreq->curr_folio_order = folioq->orders[slot];
if (!was_async)
cond_resched();
goto next_folio;
}
/* Deal with partial progress. */
if (subreq->transferred < subreq->len)
return false;
/* Donate the remaining downloaded data to one of the neighbouring
* subrequests. Note that we may race with them doing the same thing.
*/
spin_lock_bh(&rreq->lock);
if (subreq->prev_donated != prev_donated ||
subreq->next_donated != next_donated) {
spin_unlock_bh(&rreq->lock);
cond_resched();
goto donation_changed;
}
/* Deal with the trickiest case: that this subreq is in the middle of a
* folio, not touching either edge, but finishes first. In such a
* case, we donate to the previous subreq, if there is one and if it is
* contiguous, so that the donation is only handled when that completes
* - and remove this subreq from the list.
*
* If the previous subreq finished first, we will have acquired their
* donation and should be able to unlock folios and/or donate nextwards.
*/
if (!subreq->consumed &&
!prev_donated &&
!list_is_first(&subreq->rreq_link, &rreq->subrequests) &&
subreq->start == prev->start + prev->len) {
prev = list_prev_entry(subreq, rreq_link);
WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
subreq->start += subreq->len;
subreq->len = 0;
subreq->transferred = 0;
trace_netfs_donate(rreq, subreq, prev, subreq->len,
netfs_trace_donate_to_prev);
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
goto remove_subreq_locked;
}
/* If we can't donate down the chain, donate up the chain instead. */
excess = subreq->len - subreq->consumed + next_donated;
if (!subreq->consumed)
excess += prev_donated;
if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
rreq->prev_donated = excess;
trace_netfs_donate(rreq, subreq, NULL, excess,
netfs_trace_donate_to_deferred_next);
} else {
next = list_next_entry(subreq, rreq_link);
WRITE_ONCE(next->prev_donated, excess);
trace_netfs_donate(rreq, subreq, next, excess,
netfs_trace_donate_to_next);
}
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
subreq->len = subreq->consumed;
subreq->transferred = subreq->consumed;
goto remove_subreq_locked;
remove_subreq:
spin_lock_bh(&rreq->lock);
remove_subreq_locked:
subreq->consumed = subreq->len;
list_del(&subreq->rreq_link);
spin_unlock_bh(&rreq->lock);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
return true;
bad:
/* Errr... prev and next both donated to us, but insufficient to finish
* the folio.
*/
printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
rreq->debug_id, subreq->debug_index,
subreq->start, subreq->start + subreq->transferred - 1,
subreq->consumed, subreq->transferred, subreq->len);
printk("folio: %llx-%llx\n", fpos, fend - 1);
printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
printk("s=%llx av=%zx part=%zx\n", start, avail, part);
BUG();
}
/*
* Do page flushing and suchlike after DIO.
*/
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
unsigned int i;
/* Collect unbuffered reads and direct reads, adding up the transfer
* sizes until we find the first short or failed subrequest.
*/
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
rreq->transferred += subreq->transferred;
if (subreq->transferred < subreq->len ||
test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
rreq->error = subreq->error;
break;
}
}
if (rreq->origin == NETFS_DIO_READ) {
for (i = 0; i < rreq->direct_bv_count; i++) {
flush_dcache_page(rreq->direct_bv[i].bv_page);
// TODO: cifs marks pages in the destination buffer
// dirty under some circumstances after a read. Do we
// need to do that too?
set_page_dirty(rreq->direct_bv[i].bv_page);
}
}
if (rreq->iocb) {
rreq->iocb->ki_pos += rreq->transferred;
if (rreq->iocb->ki_complete)
rreq->iocb->ki_complete(
rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
}
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
if (rreq->origin == NETFS_DIO_READ)
inode_dio_end(rreq->inode);
}
/*
* Assess the state of a read request and decide what to do next.
*
* Note that we're in normal kernel thread context at this point, possibly
* running on a workqueue.
*/
static void netfs_rreq_assess(struct netfs_io_request *rreq)
{
trace_netfs_rreq(rreq, netfs_rreq_trace_assess);
//netfs_rreq_is_still_valid(rreq);
if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) {
netfs_retry_reads(rreq);
return;
}
if (rreq->origin == NETFS_DIO_READ ||
rreq->origin == NETFS_READ_GAPS)
netfs_rreq_assess_dio(rreq);
task_io_account_read(rreq->transferred);
trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
trace_netfs_rreq(rreq, netfs_rreq_trace_done);
netfs_clear_subrequests(rreq, false);
netfs_unlock_abandoned_read_pages(rreq);
if (unlikely(test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)))
netfs_pgpriv2_write_to_the_cache(rreq);
}
void netfs_read_termination_worker(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
netfs_see_request(rreq, netfs_rreq_trace_see_work);
netfs_rreq_assess(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete);
}
/*
* Handle the completion of all outstanding I/O operations on a read request.
* We inherit a ref from the caller.
*/
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async)
{
if (!was_async)
return netfs_rreq_assess(rreq);
if (!work_pending(&rreq->work)) {
netfs_get_request(rreq, netfs_rreq_trace_get_work);
if (!queue_work(system_unbound_wq, &rreq->work))
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq);
}
}
/**
* netfs_read_subreq_progress - Note progress of a read operation.
* @subreq: The read request that has terminated.
* @was_async: True if we're in an asynchronous context.
*
* This tells the read side of netfs lib that a contributory I/O operation has
* made some progress and that it may be possible to unlock some folios.
*
* Before calling, the filesystem should update subreq->transferred to track
* the amount of data copied into the output buffer.
*
* If @was_async is true, the caller might be running in softirq or interrupt
* context and we can't sleep.
*/
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq,
bool was_async)
{
struct netfs_io_request *rreq = subreq->rreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_progress);
if (subreq->transferred > subreq->consumed &&
(rreq->origin == NETFS_READAHEAD ||
rreq->origin == NETFS_READPAGE ||
rreq->origin == NETFS_READ_FOR_WRITE)) {
netfs_consume_read_data(subreq, was_async);
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);
/**
* netfs_read_subreq_terminated - Note the termination of an I/O operation.
* @subreq: The I/O request that has terminated.
* @error: Error code indicating type of completion.
* @was_async: The termination was asynchronous
*
* This tells the read helper that a contributory I/O operation has terminated,
* one way or another, and that it should integrate the results.
*
* The caller indicates the outcome of the operation through @error, supplying
* 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY
* is set) or a negative error code. The helper will look after reissuing I/O
* operations as appropriate and writing downloaded data to the cache.
*
* Before calling, the filesystem should update subreq->transferred to track
* the amount of data copied into the output buffer.
*
* If @was_async is true, the caller might be running in softirq or interrupt
* context and we can't sleep.
*/
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
int error, bool was_async)
{
struct netfs_io_request *rreq = subreq->rreq;
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
netfs_stat(&netfs_n_rh_read_done);
break;
case NETFS_DOWNLOAD_FROM_SERVER:
netfs_stat(&netfs_n_rh_download_done);
break;
default:
break;
}
if (rreq->origin != NETFS_DIO_READ) {
/* Collect buffered reads.
*
* If the read completed validly short, then we can clear the
* tail before going on to unlock the folios.
*/
if (error == 0 && subreq->transferred < subreq->len &&
(test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
netfs_clear_unread(subreq);
subreq->transferred = subreq->len;
trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
}
if (subreq->transferred > subreq->consumed &&
(rreq->origin == NETFS_READAHEAD ||
rreq->origin == NETFS_READPAGE ||
rreq->origin == NETFS_READ_FOR_WRITE)) {
netfs_consume_read_data(subreq, was_async);
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
}
rreq->transferred += subreq->transferred;
}
/* Deal with retry requests, short reads and errors. If we retry
* but don't make progress, we abandon the attempt.
*/
if (!error && subreq->transferred < subreq->len) {
if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
} else {
trace_netfs_sreq(subreq, netfs_sreq_trace_short);
if (subreq->transferred > subreq->consumed) {
/* If we didn't read new data, abandon retry. */
if (subreq->retry_count &&
test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
}
} else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
} else {
__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
error = -ENODATA;
}
}
}
subreq->error = error;
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
if (unlikely(error < 0)) {
trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
if (subreq->source == NETFS_READ_FROM_CACHE) {
netfs_stat(&netfs_n_rh_read_failed);
} else {
netfs_stat(&netfs_n_rh_download_failed);
set_bit(NETFS_RREQ_FAILED, &rreq->flags);
rreq->error = subreq->error;
}
}
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_terminated(rreq, was_async);
netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);