mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-17 02:15:57 +00:00
futex: Fix inode life-time issue
As reported by Jann, ihold() does not in fact guarantee inode persistence. And instead of making it so, replace the usage of inode pointers with a per boot, machine wide, unique inode identifier. This sequence number is global, but shared (file backed) futexes are rare enough that this should not become a performance issue. Reported-by: Jann Horn <jannh@google.com> Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
This commit is contained in:
parent
98d54f81e3
commit
8019ad13ef
@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
|
|||||||
inode->i_sb = sb;
|
inode->i_sb = sb;
|
||||||
inode->i_blkbits = sb->s_blocksize_bits;
|
inode->i_blkbits = sb->s_blocksize_bits;
|
||||||
inode->i_flags = 0;
|
inode->i_flags = 0;
|
||||||
|
atomic64_set(&inode->i_sequence, 0);
|
||||||
atomic_set(&inode->i_count, 1);
|
atomic_set(&inode->i_count, 1);
|
||||||
inode->i_op = &empty_iops;
|
inode->i_op = &empty_iops;
|
||||||
inode->i_fop = &no_open_fops;
|
inode->i_fop = &no_open_fops;
|
||||||
|
@ -698,6 +698,7 @@ struct inode {
|
|||||||
struct rcu_head i_rcu;
|
struct rcu_head i_rcu;
|
||||||
};
|
};
|
||||||
atomic64_t i_version;
|
atomic64_t i_version;
|
||||||
|
atomic64_t i_sequence; /* see futex */
|
||||||
atomic_t i_count;
|
atomic_t i_count;
|
||||||
atomic_t i_dio_count;
|
atomic_t i_dio_count;
|
||||||
atomic_t i_writecount;
|
atomic_t i_writecount;
|
||||||
|
@ -31,23 +31,26 @@ struct task_struct;
|
|||||||
|
|
||||||
union futex_key {
|
union futex_key {
|
||||||
struct {
|
struct {
|
||||||
|
u64 i_seq;
|
||||||
unsigned long pgoff;
|
unsigned long pgoff;
|
||||||
struct inode *inode;
|
unsigned int offset;
|
||||||
int offset;
|
|
||||||
} shared;
|
} shared;
|
||||||
struct {
|
struct {
|
||||||
unsigned long address;
|
union {
|
||||||
struct mm_struct *mm;
|
struct mm_struct *mm;
|
||||||
int offset;
|
u64 __tmp;
|
||||||
|
};
|
||||||
|
unsigned long address;
|
||||||
|
unsigned int offset;
|
||||||
} private;
|
} private;
|
||||||
struct {
|
struct {
|
||||||
|
u64 ptr;
|
||||||
unsigned long word;
|
unsigned long word;
|
||||||
void *ptr;
|
unsigned int offset;
|
||||||
int offset;
|
|
||||||
} both;
|
} both;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
|
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = 0ULL } }
|
||||||
|
|
||||||
#ifdef CONFIG_FUTEX
|
#ifdef CONFIG_FUTEX
|
||||||
enum {
|
enum {
|
||||||
|
@ -429,7 +429,7 @@ static void get_futex_key_refs(union futex_key *key)
|
|||||||
|
|
||||||
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
|
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
|
||||||
case FUT_OFF_INODE:
|
case FUT_OFF_INODE:
|
||||||
ihold(key->shared.inode); /* implies smp_mb(); (B) */
|
smp_mb(); /* explicit smp_mb(); (B) */
|
||||||
break;
|
break;
|
||||||
case FUT_OFF_MMSHARED:
|
case FUT_OFF_MMSHARED:
|
||||||
futex_get_mm(key); /* implies smp_mb(); (B) */
|
futex_get_mm(key); /* implies smp_mb(); (B) */
|
||||||
@ -463,7 +463,6 @@ static void drop_futex_key_refs(union futex_key *key)
|
|||||||
|
|
||||||
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
|
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
|
||||||
case FUT_OFF_INODE:
|
case FUT_OFF_INODE:
|
||||||
iput(key->shared.inode);
|
|
||||||
break;
|
break;
|
||||||
case FUT_OFF_MMSHARED:
|
case FUT_OFF_MMSHARED:
|
||||||
mmdrop(key->private.mm);
|
mmdrop(key->private.mm);
|
||||||
@ -505,6 +504,46 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
|
|||||||
return timeout;
|
return timeout;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generate a machine wide unique identifier for this inode.
|
||||||
|
*
|
||||||
|
* This relies on u64 not wrapping in the life-time of the machine; which with
|
||||||
|
* 1ns resolution means almost 585 years.
|
||||||
|
*
|
||||||
|
* This further relies on the fact that a well formed program will not unmap
|
||||||
|
* the file while it has a (shared) futex waiting on it. This mapping will have
|
||||||
|
* a file reference which pins the mount and inode.
|
||||||
|
*
|
||||||
|
* If for some reason an inode gets evicted and read back in again, it will get
|
||||||
|
* a new sequence number and will _NOT_ match, even though it is the exact same
|
||||||
|
* file.
|
||||||
|
*
|
||||||
|
* It is important that match_futex() will never have a false-positive, esp.
|
||||||
|
* for PI futexes that can mess up the state. The above argues that false-negatives
|
||||||
|
* are only possible for malformed programs.
|
||||||
|
*/
|
||||||
|
static u64 get_inode_sequence_number(struct inode *inode)
|
||||||
|
{
|
||||||
|
static atomic64_t i_seq;
|
||||||
|
u64 old;
|
||||||
|
|
||||||
|
/* Does the inode already have a sequence number? */
|
||||||
|
old = atomic64_read(&inode->i_sequence);
|
||||||
|
if (likely(old))
|
||||||
|
return old;
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
u64 new = atomic64_add_return(1, &i_seq);
|
||||||
|
if (WARN_ON_ONCE(!new))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
|
||||||
|
if (old)
|
||||||
|
return old;
|
||||||
|
return new;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* get_futex_key() - Get parameters which are the keys for a futex
|
* get_futex_key() - Get parameters which are the keys for a futex
|
||||||
* @uaddr: virtual address of the futex
|
* @uaddr: virtual address of the futex
|
||||||
@ -517,9 +556,15 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
|
|||||||
*
|
*
|
||||||
* The key words are stored in @key on success.
|
* The key words are stored in @key on success.
|
||||||
*
|
*
|
||||||
* For shared mappings, it's (page->index, file_inode(vma->vm_file),
|
* For shared mappings (when @fshared), the key is:
|
||||||
* offset_within_page). For private mappings, it's (uaddr, current->mm).
|
* ( inode->i_sequence, page->index, offset_within_page )
|
||||||
* We can usually work out the index without swapping in the page.
|
* [ also see get_inode_sequence_number() ]
|
||||||
|
*
|
||||||
|
* For private mappings (or when !@fshared), the key is:
|
||||||
|
* ( current->mm, address, 0 )
|
||||||
|
*
|
||||||
|
* This allows (cross process, where applicable) identification of the futex
|
||||||
|
* without keeping the page pinned for the duration of the FUTEX_WAIT.
|
||||||
*
|
*
|
||||||
* lock_page() might sleep, the caller should not hold a spinlock.
|
* lock_page() might sleep, the caller should not hold a spinlock.
|
||||||
*/
|
*/
|
||||||
@ -659,8 +704,6 @@ again:
|
|||||||
key->private.mm = mm;
|
key->private.mm = mm;
|
||||||
key->private.address = address;
|
key->private.address = address;
|
||||||
|
|
||||||
get_futex_key_refs(key); /* implies smp_mb(); (B) */
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
struct inode *inode;
|
struct inode *inode;
|
||||||
|
|
||||||
@ -692,40 +735,14 @@ again:
|
|||||||
goto again;
|
goto again;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Take a reference unless it is about to be freed. Previously
|
|
||||||
* this reference was taken by ihold under the page lock
|
|
||||||
* pinning the inode in place so i_lock was unnecessary. The
|
|
||||||
* only way for this check to fail is if the inode was
|
|
||||||
* truncated in parallel which is almost certainly an
|
|
||||||
* application bug. In such a case, just retry.
|
|
||||||
*
|
|
||||||
* We are not calling into get_futex_key_refs() in file-backed
|
|
||||||
* cases, therefore a successful atomic_inc return below will
|
|
||||||
* guarantee that get_futex_key() will still imply smp_mb(); (B).
|
|
||||||
*/
|
|
||||||
if (!atomic_inc_not_zero(&inode->i_count)) {
|
|
||||||
rcu_read_unlock();
|
|
||||||
put_page(page);
|
|
||||||
|
|
||||||
goto again;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Should be impossible but lets be paranoid for now */
|
|
||||||
if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
|
|
||||||
err = -EFAULT;
|
|
||||||
rcu_read_unlock();
|
|
||||||
iput(inode);
|
|
||||||
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
|
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
|
||||||
key->shared.inode = inode;
|
key->shared.i_seq = get_inode_sequence_number(inode);
|
||||||
key->shared.pgoff = basepage_index(tail);
|
key->shared.pgoff = basepage_index(tail);
|
||||||
rcu_read_unlock();
|
rcu_read_unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
get_futex_key_refs(key); /* implies smp_mb(); (B) */
|
||||||
|
|
||||||
out:
|
out:
|
||||||
put_page(page);
|
put_page(page);
|
||||||
return err;
|
return err;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user