diff --git a/MAINTAINERS b/MAINTAINERS index 41c9c8f2b96d..58548f79e50a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5056,6 +5056,7 @@ M: Johannes Weiner M: Michal Hocko M: Roman Gushchin M: Shakeel Butt +R: Muchun Song L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained @@ -16097,7 +16098,6 @@ F: include/asm-generic/syscall.h F: include/linux/ptrace.h F: include/linux/regset.h F: include/uapi/linux/ptrace.h -F: include/uapi/linux/ptrace.h F: kernel/ptrace.c PULSE8-CEC DRIVER diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index 1931a04af85a..4d180d96f09e 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -353,7 +353,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) return 0; return __csum_and_copy(src, dst, len); } -EXPORT_SYMBOL(csum_and_copy_from_user); __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) diff --git a/arch/arm/kernel/crash_dump.c b/arch/arm/kernel/crash_dump.c index 53cb92435392..938bd932df9a 100644 --- a/arch/arm/kernel/crash_dump.c +++ b/arch/arm/kernel/crash_dump.c @@ -14,22 +14,10 @@ #include #include #include +#include -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is int he user address space - * - * This function copies one page from old kernel memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -40,14 +28,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user(buf, vaddr + offset, csize)) { - iounmap(vaddr); - return -EFAULT; - } - } else { - memcpy(buf, vaddr + offset, csize); - } + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap(vaddr); return csize; diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index 58303a9ec32c..670e4ce81822 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -9,25 +9,11 @@ #include #include #include -#include -#include +#include #include -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is in a user address space - * - * This function copies one page from old kernel memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -38,14 +24,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { - memunmap(vaddr); - return -EFAULT; - } - } else { - memcpy(buf, vaddr + offset, csize); - } + csize = copy_to_iter(vaddr + offset, csize, iter); memunmap(vaddr); diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c index 0ed3c3dee4cd..4ef68e2aa757 100644 --- a/arch/ia64/kernel/crash_dump.c +++ b/arch/ia64/kernel/crash_dump.c @@ -10,42 +10,18 @@ #include #include #include - +#include #include -#include -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - * - * Calling copy_to_user() in atomic context is not desirable. Hence first - * copying the data to a pre-allocated kernel page and then copying to user - * space in non-atomic context. - */ -ssize_t -copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; if (!csize) return 0; vaddr = __va(pfn<cr_ifs; diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 753642366e12..53735b1d1be3 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -309,7 +309,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) /* * Lower 4 bits are used as a count. Upper bits are a sequence * number that is updated when count is reset. The cmpxchg will - * fail is seqno has changed. This minimizes mutiple cpus + * fail is seqno has changed. This minimizes multiple cpus * resetting the count. */ if (current_jiffies > last.time) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 3c3e15b22608..855d949d81df 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -449,7 +449,7 @@ mem_init (void) memblock_free_all(); /* - * For fsyscall entrpoints with no light-weight handler, use the ordinary + * For fsyscall entrypoints with no light-weight handler, use the ordinary * (heavy-weight) handler, but mark it by setting bit 0, so the fsyscall entry * code can tell them apart. */ diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 135b5135cace..ca060e7a2a46 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -174,7 +174,7 @@ __setup("nptcg=", set_nptcg); * override table (in which case we should ignore the value from * PAL_VM_SUMMARY). * - * Kernel parameter "nptcg=" overrides maximum number of simultanesous ptc.g + * Kernel parameter "nptcg=" overrides maximum number of simultaneous ptc.g * purges defined in either PAL_VM_SUMMARY or PAL override table. In this case, * we should ignore the value from either PAL_VM_SUMMARY or PAL override table. * @@ -516,7 +516,7 @@ int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size) if (i >= per_cpu(ia64_tr_num, cpu)) return -EBUSY; - /*Record tr info for mca hander use!*/ + /*Record tr info for mca handler use!*/ if (i > per_cpu(ia64_tr_used, cpu)) per_cpu(ia64_tr_used, cpu) = i; diff --git a/arch/m68k/lib/checksum.c b/arch/m68k/lib/checksum.c index 7e6afeae6217..5acb821849d3 100644 --- a/arch/m68k/lib/checksum.c +++ b/arch/m68k/lib/checksum.c @@ -265,8 +265,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) return sum; } -EXPORT_SYMBOL(csum_and_copy_from_user); - /* * copy from kernel space while checksumming, otherwise like csum_partial diff --git a/arch/mips/kernel/crash_dump.c b/arch/mips/kernel/crash_dump.c index 2e50f55185a6..6e50f4902409 100644 --- a/arch/mips/kernel/crash_dump.c +++ b/arch/mips/kernel/crash_dump.c @@ -1,22 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -24,14 +12,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return 0; vaddr = kmap_local_pfn(pfn); - - if (!userbuf) { - memcpy(buf, vaddr + offset, csize); - } else { - if (copy_to_user(buf, vaddr + offset, csize)) - csize = -EFAULT; - } - + csize = copy_to_iter(vaddr + offset, csize, iter); kunmap_local(vaddr); return csize; diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index 5693e1c67c2b..32b4a97f1b79 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -68,33 +68,8 @@ void __init setup_kdump_trampoline(void) } #endif /* CONFIG_NONSTATIC_KERNEL */ -static size_t copy_oldmem_vaddr(void *vaddr, char *buf, size_t csize, - unsigned long offset, int userbuf) -{ - if (userbuf) { - if (copy_to_user((char __user *)buf, (vaddr + offset), csize)) - return -EFAULT; - } else - memcpy(buf, (vaddr + offset), csize); - - return csize; -} - -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; phys_addr_t paddr; @@ -107,10 +82,10 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (memblock_is_region_memory(paddr, csize)) { vaddr = __va(paddr); - csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); + csize = copy_to_iter(vaddr + offset, csize, iter); } else { vaddr = ioremap_cache(paddr, PAGE_SIZE); - csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap(vaddr); } diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index f3999cbb2fcc..1a14c8780278 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -24,7 +24,6 @@ __wsum csum_and_copy_from_user(const void __user *src, void *dst, user_read_access_end(); return csum; } -EXPORT_SYMBOL(csum_and_copy_from_user); __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len) { @@ -38,4 +37,3 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len) user_write_access_end(); return csum; } -EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/riscv/kernel/crash_dump.c b/arch/riscv/kernel/crash_dump.c index 86cc0ada5752..ea2158cee97b 100644 --- a/arch/riscv/kernel/crash_dump.c +++ b/arch/riscv/kernel/crash_dump.c @@ -7,22 +7,10 @@ #include #include +#include -/** - * copy_oldmem_page() - copy one page from old kernel memory - * @pfn: page frame number to be copied - * @buf: buffer where the copied page is placed - * @csize: number of bytes to copy - * @offset: offset in bytes into the page - * @userbuf: if set, @buf is in a user address space - * - * This function copies one page from old kernel memory into buffer pointed by - * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes - * copied or negative error in case of failure. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void *vaddr; @@ -33,13 +21,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { - memunmap(vaddr); - return -EFAULT; - } - } else - memcpy(buf, vaddr + offset, csize); + csize = copy_to_iter(vaddr + offset, csize, iter); memunmap(vaddr); return csize; diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 69819b765250..a2c1c55daec0 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -212,8 +213,8 @@ static int copy_oldmem_user(void __user *dst, unsigned long src, size_t count) /* * Copy one page from "oldmem" */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { unsigned long src; int rc; @@ -221,10 +222,12 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, if (!csize) return 0; src = pfn_to_phys(pfn) + offset; - if (userbuf) - rc = copy_oldmem_user((void __force __user *) buf, src, csize); + + /* XXX: pass the iov_iter down to a common function */ + if (iter_is_iovec(iter)) + rc = copy_oldmem_user(iter->iov->iov_base, src, csize); else - rc = copy_oldmem_kernel((void *) buf, src, csize); + rc = copy_oldmem_kernel(iter->kvec->iov_base, src, csize); return rc; } diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c index 5b41b59698c1..19ce6a950aac 100644 --- a/arch/sh/kernel/crash_dump.c +++ b/arch/sh/kernel/crash_dump.c @@ -8,23 +8,11 @@ #include #include #include +#include #include -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { void __iomem *vaddr; @@ -32,15 +20,8 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return 0; vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); - - if (userbuf) { - if (copy_to_user((void __user *)buf, (vaddr + offset), csize)) { - iounmap(vaddr); - return -EFAULT; - } - } else - memcpy(buf, (vaddr + offset), csize); - + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap(vaddr); + return csize; } diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 5fcac46aaf6b..5f4ae5476e19 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,8 +10,7 @@ #include #include #include - -#include +#include static inline bool is_crashed_pfn_valid(unsigned long pfn) { @@ -29,21 +28,8 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) #endif } -/** - * copy_oldmem_page - copy one page from "oldmem" - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from "oldmem". For this page, there might be no pte mapped - * in the current kernel. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { void *vaddr; @@ -54,14 +40,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, return -EFAULT; vaddr = kmap_local_pfn(pfn); - - if (!userbuf) { - memcpy(buf, vaddr + offset, csize); - } else { - if (copy_to_user(buf, vaddr + offset, csize)) - csize = -EFAULT; - } - + csize = copy_to_iter(vaddr + offset, csize, iter); kunmap_local(vaddr); return csize; diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 97529552dd24..e75bc2f217ff 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -8,12 +8,12 @@ #include #include -#include +#include #include #include -static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf, +static ssize_t __copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset, bool encrypted) { void *vaddr; @@ -29,50 +29,36 @@ static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, if (!vaddr) return -ENOMEM; - if (userbuf) { - if (copy_to_user((void __user *)buf, vaddr + offset, csize)) { - iounmap((void __iomem *)vaddr); - return -EFAULT; - } - } else - memcpy(buf, vaddr + offset, csize); + csize = copy_to_iter(vaddr + offset, csize, iter); iounmap((void __iomem *)vaddr); return csize; } -/** - * copy_oldmem_page - copy one page of memory - * @pfn: page frame number to be copied - * @buf: target memory address for the copy; this can be in kernel address - * space or user address space (see @userbuf) - * @csize: number of bytes to copy - * @offset: offset in bytes into the page (based on pfn) to begin the copy - * @userbuf: if set, @buf is in user address space, use copy_to_user(), - * otherwise @buf is in kernel address space, use memcpy(). - * - * Copy a page from the old kernel's memory. For this page, there is no pte - * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic. - */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(struct iov_iter *iter, unsigned long pfn, size_t csize, + unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false); + return __copy_oldmem_page(iter, pfn, csize, offset, false); } -/** +/* * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the * memory with the encryption mask set to accommodate kdump on SME-enabled * machines. */ -ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset) { - return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true); + return __copy_oldmem_page(iter, pfn, csize, offset, true); } ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)); } diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 189344924a2b..145f9a0bde29 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -32,7 +32,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len) user_access_end(); return sum; } -EXPORT_SYMBOL(csum_and_copy_from_user); /** * csum_and_copy_to_user - Copy and checksum to user space. @@ -57,7 +56,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len) user_access_end(); return sum; } -EXPORT_SYMBOL(csum_and_copy_to_user); /** * csum_partial_copy_nocheck - Copy and checksum. diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 7df466e22282..2cdc054e53a5 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -915,7 +915,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, goto err_req; } - if (xfer->length + xfer->offset > map->size) { + if (xfer->length + xfer->offset > req->map->size) { ret = -EINVAL; goto err_req; } @@ -927,7 +927,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, } sg_set_buf(req->sgt.sgl, - map->virt_addr + (baddr - map->phys_addr) + + req->map->virt_addr + (baddr - req->map->phys_addr) + xfer->offset, xfer->length); } diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 646510476c30..bfa431a8e690 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -175,7 +175,8 @@ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; - p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC); + p = kmalloc(sizeof(struct tty_buffer) + 2 * size, + GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 02d4d4234956..a415c02ede39 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -126,6 +126,7 @@ struct msdos_inode_info { struct hlist_node i_fat_hash; /* hash by i_location */ struct hlist_node i_dir_hash; /* hash by i_logstart */ struct rw_semaphore truncate_lock; /* protect bmap against truncate */ + struct timespec64 i_crtime; /* File creation (birth) time */ struct inode vfs_inode; }; @@ -433,8 +434,15 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...); __fat_fs_error(sb, 1, fmt , ## args) #define fat_fs_error_ratelimit(sb, fmt, args...) \ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) + +#define FAT_PRINTK_PREFIX "%sFAT-fs (%s): " +#define fat_msg(sb, level, fmt, args...) \ +do { \ + printk_index_subsys_emit(FAT_PRINTK_PREFIX, level, fmt, ##args);\ + _fat_msg(sb, level, fmt, ##args); \ +} while (0) __printf(3, 4) __cold -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); +void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); #define fat_msg_ratelimit(sb, level, fmt, args...) \ do { \ if (__ratelimit(&MSDOS_SB(sb)->ratelimit)) \ @@ -446,6 +454,10 @@ extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); +extern struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); +extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); extern int fat_update_time(struct inode *inode, struct timespec64 *now, diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 978ac6751aeb..1db348f8f887 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -94,7 +94,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, err_brelse: brelse(bhs[0]); err: - fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); + fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); return -EIO; } @@ -107,8 +108,8 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { - fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", - (llu)blocknr); + fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)", + (llu)blocknr); return -EIO; } fatent->nr_bhs = 1; diff --git a/fs/fat/file.c b/fs/fat/file.c index bf91f977debe..3dae3ed60f3a 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -398,13 +398,21 @@ int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(mnt_userns, inode, stat); - stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { + generic_fillattr(mnt_userns, inode, stat); + stat->blksize = sbi->cluster_size; + + if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { /* Use i_pos for ino. This is used as fileid of nfs. */ - stat->ino = fat_i_pos_read(MSDOS_SB(inode->i_sb), inode); + stat->ino = fat_i_pos_read(sbi, inode); } + + if (sbi->options.isvfat && request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = MSDOS_I(inode)->i_crtime; + } + return 0; } EXPORT_SYMBOL_GPL(fat_getattr); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 69b4d4ae64d7..a38238d75c08 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -567,12 +567,13 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) & ~((loff_t)sbi->cluster_size - 1)) >> 9; fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); + inode->i_ctime = inode->i_mtime; if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, - de->cdate, de->ctime_cs); fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, + de->cdate, de->ctime_cs); } else - fat_truncate_time(inode, &inode->i_mtime, S_ATIME|S_CTIME); + inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime); return 0; } @@ -757,6 +758,8 @@ static struct inode *fat_alloc_inode(struct super_block *sb) ei->i_logstart = 0; ei->i_attrs = 0; ei->i_pos = 0; + ei->i_crtime.tv_sec = 0; + ei->i_crtime.tv_nsec = 0; return &ei->vfs_inode; } @@ -888,10 +891,10 @@ static int __fat_write_inode(struct inode *inode, int wait) &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; - fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, - &raw_entry->cdate, &raw_entry->ctime_cs); fat_time_unix2fat(sbi, &inode->i_atime, &atime, &raw_entry->adate, NULL); + fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime, + &raw_entry->cdate, &raw_entry->ctime_cs); } spin_unlock(&sbi->inode_hash_lock); mark_buffer_dirty(bh); @@ -1885,10 +1888,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); out_fail: - if (fsinfo_inode) - iput(fsinfo_inode); - if (fat_inode) - iput(fat_inode); + iput(fsinfo_inode); + iput(fat_inode); unload_nls(sbi->nls_io); unload_nls(sbi->nls_disk); fat_reset_iocharset(&sbi->options); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 91ca3c304211..7e5d6ae305f2 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -42,10 +42,16 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) EXPORT_SYMBOL_GPL(__fat_fs_error); /** - * fat_msg() - print preformated FAT specific messages. Every thing what is - * not fat_fs_error() should be fat_msg(). + * _fat_msg() - Print a preformatted FAT message based on a superblock. + * @sb: A pointer to a &struct super_block + * @level: A Kernel printk level constant + * @fmt: The printf-style format string to print. + * + * Everything that is not fat_fs_error() should be fat_msg(). + * + * fat_msg() wraps _fat_msg() for printk indexing. */ -void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +void _fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -53,7 +59,7 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + _printk(FAT_PRINTK_PREFIX "%pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -187,7 +193,7 @@ static long days_in_year[] = { 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; -static inline int fat_tz_offset(struct msdos_sb_info *sbi) +static inline int fat_tz_offset(const struct msdos_sb_info *sbi) { return (sbi->options.tz_set ? -sbi->options.time_offset : @@ -275,23 +281,35 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; } -static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts) +/* + * truncate atime to 24 hour granularity (00:00:00 in local timezone) + */ +struct timespec64 fat_truncate_atime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) { - if (ts.tv_nsec) - ts.tv_nsec -= ts.tv_nsec % 10000000UL; - return ts; + /* to localtime */ + time64_t seconds = ts->tv_sec - fat_tz_offset(sbi); + s32 remainder; + + div_s64_rem(seconds, SECS_PER_DAY, &remainder); + /* to day boundary, and back to unix time */ + seconds = seconds + fat_tz_offset(sbi) - remainder; + + return (struct timespec64){ seconds, 0 }; +} + +/* + * truncate mtime to 2 second granularity + */ +struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, + const struct timespec64 *ts) +{ + return fat_timespec64_trunc_2secs(*ts); } /* * truncate the various times with appropriate granularity: - * root inode: - * all times always 0 - * all other inodes: - * mtime - 2 seconds - * ctime - * msdos - 2 seconds - * vfat - 10 milliseconds - * atime - 24 hours (00:00:00 in local timezone) + * all times in root node are always 0 */ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) { @@ -306,25 +324,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) ts = current_time(inode); } - if (flags & S_ATIME) { - /* to localtime */ - time64_t seconds = now->tv_sec - fat_tz_offset(sbi); - s32 remainder; - - div_s64_rem(seconds, SECS_PER_DAY, &remainder); - /* to day boundary, and back to unix time */ - seconds = seconds + fat_tz_offset(sbi) - remainder; - - inode->i_atime = (struct timespec64){ seconds, 0 }; - } - if (flags & S_CTIME) { - if (sbi->options.isvfat) - inode->i_ctime = fat_timespec64_trunc_10ms(*now); - else - inode->i_ctime = fat_timespec64_trunc_2secs(*now); - } + if (flags & S_ATIME) + inode->i_atime = fat_truncate_atime(sbi, now); + /* + * ctime and mtime share the same on-disk field, and should be + * identical in memory. all mtime updates will be applied to ctime, + * but ctime updates are ignored. + */ if (flags & S_MTIME) - inode->i_mtime = fat_timespec64_trunc_2secs(*now); + inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now); return 0; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 5369d82e0bfb..c573314806cf 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -780,8 +780,6 @@ static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, goto out; } inode_inc_iversion(inode); - fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); - /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); out: @@ -878,8 +876,6 @@ static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, } inode_inc_iversion(inode); set_nlink(inode, 2); - fat_truncate_time(inode, &ts, S_ATIME|S_CTIME|S_MTIME); - /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); diff --git a/fs/namei.c b/fs/namei.c index 896ade8b7400..3dc0db2df561 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1032,7 +1032,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1041,7 +1041,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, @@ -1050,7 +1050,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, @@ -1059,7 +1059,7 @@ static struct ctl_table namei_sysctls[] = { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), - .mode = 0600, + .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index e1392a9b8ceb..a8abe2296514 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1772,11 +1772,11 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, last_vcn = -1; do { VCN vcn; - pgoff_t idx, start_idx; + pgoff_t start_idx; unsigned ofs, do_pages, u; size_t copied; - start_idx = idx = pos >> PAGE_SHIFT; + start_idx = pos >> PAGE_SHIFT; ofs = pos & ~PAGE_MASK; bytes = PAGE_SIZE - ofs; do_pages = 1; diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 5781b9e8e3d8..0c6de6287737 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -668,9 +668,11 @@ static u32 format_size_gb(const u64 bytes, u32 *mb) static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) { - return boot->sectors_per_clusters <= 0x80 - ? boot->sectors_per_clusters - : (1u << (0 - boot->sectors_per_clusters)); + if (boot->sectors_per_clusters <= 0x80) + return boot->sectors_per_clusters; + if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ + return 1U << (0 - boot->sectors_per_clusters); + return -EINVAL; } /* @@ -713,6 +715,8 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, /* cluster size: 512, 1K, 2K, 4K, ... 2M */ sct_per_clst = true_sectors_per_clst(boot); + if ((int)sct_per_clst < 0) + goto out; if (!is_power_of_2(sct_per_clst)) goto out; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index d442cf5dda8a..be5e9ed7da8d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -541,7 +541,7 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) struct debug_lockres *dl = m->private; struct dlm_ctxt *dlm = dl->dl_ctxt; struct dlm_lock_resource *oldres = dl->dl_res; - struct dlm_lock_resource *res = NULL; + struct dlm_lock_resource *res = NULL, *iter; struct list_head *track_list; spin_lock(&dlm->track_lock); @@ -556,11 +556,11 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) } } - list_for_each_entry(res, track_list, tracking) { - if (&res->tracking == &dlm->tracking_list) - res = NULL; - else - dlm_lockres_get(res); + list_for_each_entry(iter, track_list, tracking) { + if (&iter->tracking != &dlm->tracking_list) { + dlm_lockres_get(iter); + res = iter; + } break; } spin_unlock(&dlm->track_lock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 61103b2d69fb..7318e4794ef9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -392,9 +392,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct dlm_lock *lock = NULL; + struct dlm_lock *lock = NULL, *iter; enum dlm_status status = DLM_NORMAL; - int found = 0, i; + int i; struct dlm_lockstatus *lksb = NULL; int ignore; u32 flags; @@ -437,7 +437,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } queue=&res->granted; - found = 0; spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { spin_unlock(&res->spinlock); @@ -461,21 +460,21 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } for (i=0; i<3; i++) { - list_for_each_entry(lock, queue, list) { - if (lock->ml.cookie == unlock->cookie && - lock->ml.node == unlock->node_idx) { - dlm_lock_get(lock); - found = 1; + list_for_each_entry(iter, queue, list) { + if (iter->ml.cookie == unlock->cookie && + iter->ml.node == unlock->node_idx) { + dlm_lock_get(iter); + lock = iter; break; } } - if (found) + if (lock) break; /* scan granted -> converting -> blocked queues */ queue++; } spin_unlock(&res->spinlock); - if (!found) { + if (!lock) { status = DLM_IVLOCKID; goto not_found; } @@ -505,7 +504,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_kick_thread(dlm, res); not_found: - if (!found) + if (!lock) mlog(ML_ERROR, "failed to find lock to unlock! " "cookie=%u:%llu\n", dlm_get_lock_cookie_node(be64_to_cpu(unlock->cookie)), diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 29f183a15798..617c92e7b925 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -433,6 +433,11 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres, } spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + spin_unlock(&lockres->l_lock); + status = -EAGAIN; + goto bail; + } /* We only compare against the currently granted level * here. If the lock is blocked waiting on a downconvert, @@ -595,7 +600,7 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); - return 0; + goto bail; } lockres->l_flags |= USER_LOCK_IN_TEARDOWN; @@ -609,22 +614,30 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) } if (lockres->l_ro_holders || lockres->l_ex_holders) { + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + /* + * lock is never requested, leave USER_LOCK_IN_TEARDOWN set + * to avoid new lock request coming in. + */ spin_unlock(&lockres->l_lock); goto bail; } - lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 5739dc301569..bb116c39b581 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -125,6 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; + journal_t *journal = osb->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -171,11 +172,10 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ - if (osb->journal) { + if (journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); - journal_t *journal = osb->journal->j_journal; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 1887a2708709..fa87d89cf754 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -810,22 +810,20 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) +/* + * alloc & initialize skeleton for journal structure. + * ocfs2_journal_init() will make fs have journal ability. + */ +int ocfs2_journal_alloc(struct ocfs2_super *osb) { - int status = -1; - struct inode *inode = NULL; /* the journal inode */ - journal_t *j_journal = NULL; - struct ocfs2_journal *journal = NULL; - struct ocfs2_dinode *di = NULL; - struct buffer_head *bh = NULL; - int inode_lock = 0; + int status = 0; + struct ocfs2_journal *journal; - /* initialize our journal structure */ journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); if (!journal) { mlog(ML_ERROR, "unable to alloc journal\n"); status = -ENOMEM; - goto done; + goto bail; } osb->journal = journal; journal->j_osb = osb; @@ -839,6 +837,21 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; +bail: + return status; +} + +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t *j_journal = NULL; + struct ocfs2_journal *journal = osb->journal; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + int inode_lock = 0; + + BUG_ON(!journal); /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, osb->slot_num); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 8dcb2f2cadbc..969d0aa28718 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -154,6 +154,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * Journal Control: * Initialize, Load, Shutdown, Wipe a journal. * + * ocfs2_journal_alloc - Initialize skeleton for journal structure. * ocfs2_journal_init - Initialize journal structures in the OSB. * ocfs2_journal_load - Load the given journal off disk. Replay it if * there's transactions still in there. @@ -167,6 +168,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. */ void ocfs2_set_journal_params(struct ocfs2_super *osb); +int ocfs2_journal_alloc(struct ocfs2_super *osb); int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b1a8b046f4c2..5022b3e9bfcd 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -921,19 +921,19 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, { struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; - struct ocfs2_quota_chunk *chunk; + struct ocfs2_quota_chunk *chunk = NULL, *iter; struct ocfs2_local_disk_chunk *dchunk; int found = 0, len; - list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + list_for_each_entry(iter, &oinfo->dqi_chunk, qc_chunk) { dchunk = (struct ocfs2_local_disk_chunk *) - chunk->qc_headerbh->b_data; + iter->qc_headerbh->b_data; if (le32_to_cpu(dchunk->dqc_free) > 0) { - found = 1; + chunk = iter; break; } } - if (!found) + if (!chunk) return NULL; if (chunk->qc_num < oinfo->dqi_chunks - 1) { diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 769e466887b0..a9d1296d736d 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -198,7 +198,7 @@ void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, resv->r_flags |= flags; } -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap) { memset(resmap, 0, sizeof(*resmap)); @@ -207,8 +207,6 @@ int ocfs2_resmap_init(struct ocfs2_super *osb, resmap->m_reservations = RB_ROOT; /* m_bitmap_len is initialized to zero by the above memset. */ INIT_LIST_HEAD(&resmap->m_lru); - - return 0; } static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 677c50663595..ec8101ef5717 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -73,15 +73,10 @@ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, /** * ocfs2_resmap_init() - Initialize fields of a reservations bitmap + * @osb: struct ocfs2_super to be saved in resmap * @resmap: struct ocfs2_reservation_map to initialize - * @obj: unused for now - * @ops: unused for now - * @max_bitmap_bytes: Maximum size of the bitmap (typically blocksize) - * - * Only possible return value other than '0' is -ENOMEM for failure to - * allocation mirror bitmap. */ -int ocfs2_resmap_init(struct ocfs2_super *osb, +void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap); /** diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 477cdf94122e..f7298816d8d9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -989,28 +989,27 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { status = -EINVAL; - goto read_super_error; + goto out; } /* probe for superblock */ status = ocfs2_sb_probe(sb, &bh, §or_size, &stats); if (status < 0) { mlog(ML_ERROR, "superblock probe failed!\n"); - goto read_super_error; + goto out; } status = ocfs2_initialize_super(sb, bh, sector_size, &stats); - osb = OCFS2_SB(sb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } brelse(bh); bh = NULL; + if (status < 0) + goto out; + + osb = OCFS2_SB(sb); if (!ocfs2_check_set_options(sb, &parsed_options)) { status = -EINVAL; - goto read_super_error; + goto out_super; } osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; @@ -1027,7 +1026,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) - goto read_super_error; + goto out_super; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -1041,7 +1040,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -EACCES; mlog(ML_ERROR, "Readonly device detected but readonly " "mount was not specified.\n"); - goto read_super_error; + goto out_super; } /* You should not be able to start a local heartbeat @@ -1050,7 +1049,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -EROFS; mlog(ML_ERROR, "Local heartbeat specified on readonly " "device.\n"); - goto read_super_error; + goto out_super; } status = ocfs2_check_journals_nolocks(osb); @@ -1059,9 +1058,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) mlog(ML_ERROR, "Recovery required on readonly " "file system, but write access is " "unavailable.\n"); - else - mlog_errno(status); - goto read_super_error; + goto out_super; } ocfs2_set_ro_flag(osb, 1); @@ -1077,10 +1074,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } status = ocfs2_verify_heartbeat(osb); - if (status < 0) { - mlog_errno(status); - goto read_super_error; - } + if (status < 0) + goto out_super; osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); @@ -1094,15 +1089,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = ocfs2_mount_volume(sb); if (status < 0) - goto read_super_error; + goto out_debugfs; if (osb->root_inode) inode = igrab(osb->root_inode); if (!inode) { status = -EIO; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL, @@ -1110,7 +1104,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) if (!osb->osb_dev_kset) { status = -ENOMEM; mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id); - goto read_super_error; + goto out_dismount; } /* Create filecheck sysfs related directories/files at @@ -1119,14 +1113,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) status = -ENOMEM; mlog(ML_ERROR, "Unable to create filecheck sysfs directory at " "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id); - goto read_super_error; + goto out_dismount; } root = d_make_root(inode); if (!root) { status = -ENOMEM; - mlog_errno(status); - goto read_super_error; + goto out_dismount; } sb->s_root = root; @@ -1178,17 +1171,21 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) return status; -read_super_error: - brelse(bh); +out_dismount: + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + goto out; - if (status) - mlog_errno(status); - - if (osb) { - atomic_set(&osb->vol_state, VOLUME_DISABLED); - wake_up(&osb->osb_mount_event); - ocfs2_dismount_volume(sb, 1); - } +out_debugfs: + debugfs_remove_recursive(osb->osb_debug_root); +out_super: + ocfs2_release_system_inodes(osb); + kfree(osb->recovery_map); + ocfs2_delete_osb(osb); + kfree(osb); +out: + mlog_errno(status); return status; } @@ -1803,11 +1800,10 @@ static int ocfs2_get_sector(struct super_block *sb, static int ocfs2_mount_volume(struct super_block *sb) { int status = 0; - int unlock_super = 0; struct ocfs2_super *osb = OCFS2_SB(sb); if (ocfs2_is_hard_readonly(osb)) - goto leave; + goto out; mutex_init(&osb->obs_trim_fs_mutex); @@ -1817,44 +1813,56 @@ static int ocfs2_mount_volume(struct super_block *sb) if (status == -EBADR && ocfs2_userspace_stack(osb)) mlog(ML_ERROR, "couldn't mount because cluster name on" " disk does not match the running cluster name.\n"); - goto leave; + goto out; } status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); - goto leave; + goto out_dlm; } - unlock_super = 1; /* This will load up the node map and add ourselves to it. */ status = ocfs2_find_slot(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } /* load all node-local system inodes */ status = ocfs2_init_local_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_super_lock; } status = ocfs2_check_volume(osb); if (status < 0) { mlog_errno(status); - goto leave; + goto out_system_inodes; } status = ocfs2_truncate_log_init(osb); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out_system_inodes; + } -leave: - if (unlock_super) - ocfs2_super_unlock(osb, 1); + ocfs2_super_unlock(osb, 1); + return 0; +out_system_inodes: + if (osb->local_alloc_state == OCFS2_LA_ENABLED) + ocfs2_shutdown_local_alloc(osb); + ocfs2_release_system_inodes(osb); + /* before journal shutdown, we should release slot_info */ + ocfs2_free_slot_info(osb); + ocfs2_journal_shutdown(osb); +out_super_lock: + ocfs2_super_unlock(osb, 1); +out_dlm: + ocfs2_dlm_shutdown(osb, 0); +out: return status; } @@ -2022,7 +2030,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out; } sb->s_fs_info = osb; @@ -2083,7 +2091,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Invalid number of node slots (%u)\n", osb->max_slots); status = -EINVAL; - goto bail; + goto out; } ocfs2_orphan_scan_init(osb); @@ -2092,7 +2100,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); mlog_errno(status); - goto bail; + goto out; } init_waitqueue_head(&osb->checkpoint_event); @@ -2110,17 +2118,13 @@ static int ocfs2_initialize_super(struct super_block *sb, init_waitqueue_head(&osb->osb_mount_event); - status = ocfs2_resmap_init(osb, &osb->osb_la_resmap); - if (status) { - mlog_errno(status); - goto bail; - } + ocfs2_resmap_init(osb, &osb->osb_la_resmap); osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); if (!osb->vol_label) { mlog(ML_ERROR, "unable to alloc vol label\n"); status = -ENOMEM; - goto bail; + goto out_recovery_map; } osb->slot_recovery_generations = @@ -2129,7 +2133,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->slot_recovery_generations) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_vol_label; } init_waitqueue_head(&osb->osb_wipe_event); @@ -2139,7 +2143,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_orphan_wipes) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_slot_recovery_gen; } osb->osb_rf_lock_tree = RB_ROOT; @@ -2155,13 +2159,13 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "couldn't mount because of unsupported " "optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (!sb_rdonly(osb->sb) && (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { mlog(ML_ERROR, "couldn't mount RDWR because of " "unsupported optional features (%x).\n", i); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } if (ocfs2_clusterinfo_valid(osb)) { @@ -2182,7 +2186,7 @@ static int ocfs2_initialize_super(struct super_block *sb, "cluster stack label (%s) \n", osb->osb_cluster_stack); status = -EINVAL; - goto bail; + goto out_orphan_wipes; } memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, @@ -2195,6 +2199,15 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); + /* + * FIXME + * This should be done in ocfs2_journal_init(), but any inode + * writes back operation will cause the filesystem to crash. + */ + status = ocfs2_journal_alloc(osb); + if (status < 0) + goto out_orphan_wipes; + INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2208,7 +2221,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", osb->s_clustersize); status = -EINVAL; - goto bail; + goto out_journal; } total_blocks = ocfs2_clusters_to_blocks(osb->sb, @@ -2220,14 +2233,14 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog(ML_ERROR, "Volume too large " "to mount safely on this system"); status = -EFBIG; - goto bail; + goto out_journal; } if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, sizeof(di->id2.i_super.s_uuid))) { mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); status = -ENOMEM; - goto bail; + goto out_journal; } strlcpy(osb->vol_label, di->id2.i_super.s_label, @@ -2247,7 +2260,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!osb->osb_dlm_debug) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_uuid_str; } atomic_set(&osb->vol_state, VOLUME_INIT); @@ -2256,7 +2269,7 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_global_system_inodes(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_dlm_out; } /* @@ -2267,7 +2280,7 @@ static int ocfs2_initialize_super(struct super_block *sb, if (!inode) { status = -EINVAL; mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; @@ -2280,16 +2293,39 @@ static int ocfs2_initialize_super(struct super_block *sb, status = ocfs2_init_slot_info(osb); if (status < 0) { mlog_errno(status); - goto bail; + goto out_system_inodes; } osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { status = -ENOMEM; mlog_errno(status); + goto out_slot_info; } -bail: + return status; + +out_slot_info: + ocfs2_free_slot_info(osb); +out_system_inodes: + ocfs2_release_system_inodes(osb); +out_dlm_out: + ocfs2_put_dlm_debug(osb->osb_dlm_debug); +out_uuid_str: + kfree(osb->uuid_str); +out_journal: + kfree(osb->journal); +out_orphan_wipes: + kfree(osb->osb_orphan_wipes); +out_slot_recovery_gen: + kfree(osb->slot_recovery_generations); +out_vol_label: + kfree(osb->vol_label); +out_recovery_map: + kfree(osb->recovery_map); +out: + kfree(osb); + sb->s_fs_info = NULL; return status; } @@ -2483,6 +2519,12 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the middle of ocfs2_initialize_super(), + * we free it here. + */ + kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); diff --git a/fs/pipe.c b/fs/pipe.c index 11358569a777..74ae9fafd25a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -653,7 +653,7 @@ pipe_poll(struct file *filp, poll_table *wait) unsigned int head, tail; /* Epoll has some historical nasty semantics, this enables them */ - pipe->poll_usage = 1; + WRITE_ONCE(pipe->poll_usage, true); /* * Reading pipe state only -- no need for acquiring the semaphore. diff --git a/fs/proc/generic.c b/fs/proc/generic.c index f2132407e133..587b91d9d998 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -448,6 +448,9 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, proc_set_user(ent, (*parent)->uid, (*parent)->gid); ent->proc_dops = &proc_misc_dentry_ops; + /* Revalidate everything under /proc/${pid}/net */ + if ((*parent)->proc_dops == &proc_net_dentry_ops) + pde_force_lookup(ent); out: return ent; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 982e694aae77..dff921f7ca33 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -479,10 +479,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * the previous entry, search for a matching entry. */ if (!m || start < m->addr || start >= m->addr + m->size) { - list_for_each_entry(m, &kclist_head, list) { - if (start >= m->addr && - start < m->addr + m->size) + struct kcore_list *iter; + + m = NULL; + list_for_each_entry(iter, &kclist_head, list) { + if (start >= iter->addr && + start < iter->addr + iter->size) { + m = iter; break; + } } } @@ -492,12 +497,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) page_offline_freeze(); } - if (&m->list == &kclist_head) { + if (!m) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; } - m = NULL; /* skip the list anchor */ goto skip; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index e1cfeda397f3..913e5acefbb6 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -376,6 +376,9 @@ static __net_init int proc_net_ns_init(struct net *net) proc_set_user(netd, uid, gid); + /* Seed dentry revalidation for /proc/${pid}/net */ + pde_force_lookup(netd); + err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 6f1b8ddc6f7a..4eaeb645e759 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -128,9 +129,8 @@ static int open_vmcore(struct inode *inode, struct file *file) } /* Reads a page from the oldmem device from given offset. */ -ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) { unsigned long pfn, offset; size_t nr_bytes; @@ -152,29 +152,23 @@ ssize_t read_from_oldmem(char *buf, size_t count, /* If pfn is not ram, return zeros for sparse dump files */ if (!pfn_is_ram(pfn)) { - tmp = 0; - if (!userbuf) - memset(buf, 0, nr_bytes); - else if (clear_user(buf, nr_bytes)) - tmp = -EFAULT; + tmp = iov_iter_zero(nr_bytes, iter); } else { if (encrypted) - tmp = copy_oldmem_page_encrypted(pfn, buf, + tmp = copy_oldmem_page_encrypted(iter, pfn, nr_bytes, - offset, - userbuf); + offset); else - tmp = copy_oldmem_page(pfn, buf, nr_bytes, - offset, userbuf); + tmp = copy_oldmem_page(iter, pfn, nr_bytes, + offset); } - if (tmp < 0) { + if (tmp < nr_bytes) { srcu_read_unlock(&vmcore_cb_srcu, idx); - return tmp; + return -EFAULT; } *ppos += nr_bytes; count -= nr_bytes; - buf += nr_bytes; read += nr_bytes; ++pfn; offset = 0; @@ -203,7 +197,12 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, false); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, false); } /* @@ -211,7 +210,13 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + struct kvec kvec = { .iov_base = buf, .iov_len = count }; + struct iov_iter iter; + + iov_iter_kvec(&iter, READ, &kvec, 1, count); + + return read_from_oldmem(&iter, count, ppos, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); } /* @@ -228,29 +233,14 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, /* * Architectures which support memory encryption override this. */ -ssize_t __weak -copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, - unsigned long offset, int userbuf) +ssize_t __weak copy_oldmem_page_encrypted(struct iov_iter *iter, + unsigned long pfn, size_t csize, unsigned long offset) { - return copy_oldmem_page(pfn, buf, csize, offset, userbuf); -} - -/* - * Copy to either kernel or user space - */ -static int copy_to(void *target, void *src, size_t size, int userbuf) -{ - if (userbuf) { - if (copy_to_user((char __user *) target, src, size)) - return -EFAULT; - } else { - memcpy(target, src, size); - } - return 0; + return copy_oldmem_page(iter, pfn, csize, offset); } #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP -static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) +static int vmcoredd_copy_dumps(struct iov_iter *iter, u64 start, size_t size) { struct vmcoredd_node *dump; u64 offset = 0; @@ -263,14 +253,13 @@ static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (copy_to(dst, buf, tsz, userbuf)) { + if (copy_to_iter(buf, tsz, iter) < tsz) { ret = -EFAULT; goto out_unlock; } size -= tsz; start += tsz; - dst += tsz; /* Leave now if buffer filled already */ if (!size) @@ -326,33 +315,28 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ -static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, - int userbuf) +static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) { ssize_t acc = 0, tmp; size_t tsz; u64 start; struct vmcore *m = NULL; - if (buflen == 0 || *fpos >= vmcore_size) + if (!iov_iter_count(iter) || *fpos >= vmcore_size) return 0; - /* trim buflen to not go beyond EOF */ - if (buflen > vmcore_size - *fpos) - buflen = vmcore_size - *fpos; + iov_iter_truncate(iter, vmcore_size - *fpos); /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { - tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); - if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) + tsz = min(elfcorebuf_sz - (size_t)*fpos, iov_iter_count(iter)); + if (copy_to_iter(elfcorebuf + *fpos, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } @@ -373,35 +357,32 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, /* Read device dumps */ if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - - (size_t)*fpos, buflen); + (size_t)*fpos, iov_iter_count(iter)); start = *fpos - elfcorebuf_sz; - if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf)) + if (vmcoredd_copy_dumps(iter, start, tsz)) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (!buflen) + if (!iov_iter_count(iter)) return acc; } #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ /* Read remaining elf notes */ - tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); + tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, + iov_iter_count(iter)); kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; - if (copy_to(buffer, kaddr, tsz, userbuf)) + if (copy_to_iter(kaddr, tsz, iter) < tsz) return -EFAULT; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } @@ -409,19 +390,17 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, if (*fpos < m->offset + m->size) { tsz = (size_t)min_t(unsigned long long, m->offset + m->size - *fpos, - buflen); + iov_iter_count(iter)); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, - userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); + tmp = read_from_oldmem(iter, tsz, &start, + cc_platform_has(CC_ATTR_MEM_ENCRYPT)); if (tmp < 0) return tmp; - buflen -= tsz; *fpos += tsz; - buffer += tsz; acc += tsz; /* leave now if filled buffer already */ - if (buflen == 0) + if (!iov_iter_count(iter)) return acc; } } @@ -429,15 +408,14 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, return acc; } -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t read_vmcore(struct kiocb *iocb, struct iov_iter *iter) { - return __read_vmcore((__force char *) buffer, buflen, fpos, 1); + return __read_vmcore(iter, &iocb->ki_pos); } /* * The vmcore fault handler uses the page cache and fills data using the - * standard __vmcore_read() function. + * standard __read_vmcore() function. * * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). @@ -447,9 +425,10 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) #ifdef CONFIG_S390 struct address_space *mapping = vmf->vma->vm_file->f_mapping; pgoff_t index = vmf->pgoff; + struct iov_iter iter; + struct kvec kvec; struct page *page; loff_t offset; - char *buf; int rc; page = find_or_create_page(mapping, index, GFP_KERNEL); @@ -457,8 +436,11 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) return VM_FAULT_OOM; if (!PageUptodate(page)) { offset = (loff_t) index << PAGE_SHIFT; - buf = __va((page_to_pfn(page) << PAGE_SHIFT)); - rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + kvec.iov_base = page_address(page); + kvec.iov_len = PAGE_SIZE; + iov_iter_kvec(&iter, READ, &kvec, 1, PAGE_SIZE); + + rc = __read_vmcore(&iter, &offset); if (rc < 0) { unlock_page(page); put_page(page); @@ -708,7 +690,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) static const struct proc_ops vmcore_proc_ops = { .proc_open = open_vmcore, - .proc_read = read_vmcore, + .proc_read_iter = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, }; diff --git a/fs/sysv/super.c b/fs/sysv/super.c index d1def0771a40..3365a30dc1e0 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -312,7 +312,9 @@ static int complete_read_super(struct super_block *sb, int silent, int size) sbi->s_firstinodezone = 2; flavour_setup[sbi->s_type](sbi, &sb->s_max_links); - + if (sbi->s_firstdatazone < sbi->s_firstinodezone) + return 0; + sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; sbi->s_inodes_per_block = bsize >> 6; sbi->s_inodes_per_block_1 = (bsize >> 6)-1; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 620821549b23..0f3a656293b0 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -24,11 +24,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); -extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, - unsigned long, int); -extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, - int userbuf); +ssize_t copy_oldmem_page(struct iov_iter *i, unsigned long pfn, size_t csize, + unsigned long offset); +ssize_t copy_oldmem_page_encrypted(struct iov_iter *iter, unsigned long pfn, + size_t csize, unsigned long offset); void vmcore_cleanup(void); @@ -135,13 +134,11 @@ static inline int vmcore_add_device_dump(struct vmcoredd_data *data) #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ #ifdef CONFIG_PROC_VMCORE -ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted); +ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted); #else -static inline ssize_t read_from_oldmem(char *buf, size_t count, - u64 *ppos, int userbuf, - bool encrypted) +static inline ssize_t read_from_oldmem(struct iov_iter *iter, size_t count, + u64 *ppos, bool encrypted) { return -EOPNOTSUPP; } diff --git a/include/linux/list.h b/include/linux/list.h index 57e8b559cdf6..61762054b4be 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -35,7 +35,7 @@ static inline void INIT_LIST_HEAD(struct list_head *list) { WRITE_ONCE(list->next, list); - list->prev = list; + WRITE_ONCE(list->prev, list); } #ifdef CONFIG_DEBUG_LIST @@ -306,7 +306,7 @@ static inline int list_empty(const struct list_head *head) static inline void list_del_init_careful(struct list_head *entry) { __list_del_entry(entry); - entry->prev = entry; + WRITE_ONCE(entry->prev, entry); smp_store_release(&entry->next, entry); } @@ -326,7 +326,7 @@ static inline void list_del_init_careful(struct list_head *entry) static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); - return list_is_head(next, head) && (next == head->prev); + return list_is_head(next, head) && (next == READ_ONCE(head->prev)); } /** diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index c00c618ef290..cb0fd633a610 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -71,7 +71,7 @@ struct pipe_inode_info { unsigned int files; unsigned int r_counter; unsigned int w_counter; - unsigned int poll_usage; + bool poll_usage; struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 15b3d176b6b4..db4509587d2c 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -30,7 +30,7 @@ extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 -#define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ +#define PT_DTRACE 0x00000002 /* delayed trace (used on um) */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ diff --git a/include/uapi/linux/acct.h b/include/uapi/linux/acct.h index 985b89068591..0e591152aa8a 100644 --- a/include/uapi/linux/acct.h +++ b/include/uapi/linux/acct.h @@ -103,12 +103,13 @@ struct acct_v3 /* * accounting flags */ - /* bit set when the process ... */ + /* bit set when the process/task ... */ #define AFORK 0x01 /* ... executed fork, but did not exec */ #define ASU 0x02 /* ... used super-user privileges */ #define ACOMPAT 0x04 /* ... used compatibility mode (VAX only not used) */ #define ACORE 0x08 /* ... dumped core */ #define AXSIG 0x10 /* ... was killed by a signal */ +#define AGROUP 0x20 /* ... was the last task of the process (task group) */ #if defined(__BYTE_ORDER) ? __BYTE_ORDER == __BIG_ENDIAN : defined(__BIG_ENDIAN) #define ACCT_BYTEORDER 0x80 /* accounting file is big endian */ diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 429bec8dd70a..c4abd09c3da9 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -134,7 +134,7 @@ typedef __s64 Elf64_Sxword; #define STT_TLS 6 #define ELF_ST_BIND(x) ((x) >> 4) -#define ELF_ST_TYPE(x) (((unsigned int) x) & 0xf) +#define ELF_ST_TYPE(x) ((x) & 0xf) #define ELF32_ST_BIND(x) ELF_ST_BIND(x) #define ELF32_ST_TYPE(x) ELF_ST_TYPE(x) #define ELF64_ST_BIND(x) ELF_ST_BIND(x) diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 12327d32378f..736154171489 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 11 +#define TASKSTATS_VERSION 12 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -48,7 +48,8 @@ struct taskstats { __u32 ac_exitcode; /* Exit status */ /* The accounting flags of a task as defined in - * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG. + * Defined values are AFORK, ASU, ACOMPAT, ACORE, AXSIG, and AGROUP. + * (AGROUP since version 12). */ __u8 ac_flag; /* Record flags */ __u8 ac_nice; /* task_nice */ @@ -173,9 +174,26 @@ struct taskstats { /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ - /* Delay waiting for memory compact */ + /* v11: Delay waiting for memory compact */ __u64 compact_count; __u64 compact_delay_total; + + /* v12 begin */ + __u32 ac_tgid; /* thread group ID */ + /* Thread group walltime up to now. This is total process walltime if + * AGROUP flag is set. + */ + __u64 ac_tgetime __attribute__((aligned(8))); + /* Lightweight information to identify process binary files. + * This leaves userspace to match this to a file system path, using + * MAJOR() and MINOR() macros to identify a device and mount point, + * the inode to identify the executable file. This is /proc/self/exe + * at the end, so matching the most recent exec(). Values are zero + * for kernel threads. + */ + __u64 ac_exe_dev; /* program binary device ID */ + __u64 ac_exe_inode; /* program binary inode number */ + /* v12 end */ }; diff --git a/init/Kconfig b/init/Kconfig index 8aed339b8e06..7efe7c56893f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -423,8 +423,8 @@ config CROSS_MEMORY_ATTACH See the man page for more details. config USELIB - bool "uselib syscall" - def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION + bool "uselib syscall (for libc5 and earlier)" + default ALPHA || M68K || SPARC help This option enables the uselib syscall, a system call used in the dynamic linker from libc5 and earlier. glibc does not use this @@ -1348,6 +1348,16 @@ config BOOT_CONFIG If unsure, say Y. +config INITRAMFS_PRESERVE_MTIME + bool "Preserve cpio archive mtimes in initramfs" + default y + help + Each entry in an initramfs cpio archive carries an mtime value. When + enabled, extracted cpio items take this mtime, with directory mtime + setting deferred until after creation of any child entries. + + If unsure, say Y. + choice prompt "Compiler optimization level" default CC_OPTIMIZE_FOR_PERFORMANCE diff --git a/init/initramfs.c b/init/initramfs.c index 2f3d96dc3db6..dc84cf756cea 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -17,8 +17,11 @@ #include #include -static ssize_t __init xwrite(struct file *file, const char *p, size_t count, - loff_t *pos) +static __initdata bool csum_present; +static __initdata u32 io_csum; + +static ssize_t __init xwrite(struct file *file, const unsigned char *p, + size_t count, loff_t *pos) { ssize_t out = 0; @@ -33,6 +36,13 @@ static ssize_t __init xwrite(struct file *file, const char *p, size_t count, } else if (rv == 0) break; + if (csum_present) { + ssize_t i; + + for (i = 0; i < rv; i++) + io_csum += p[i]; + } + p += rv; out += rv; count -= rv; @@ -116,31 +126,36 @@ static void __init free_hash(void) } } -static long __init do_utime(char *filename, time64_t mtime) +#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME +static void __init do_utime(char *filename, time64_t mtime) { - struct timespec64 t[2]; + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + init_utimes(filename, t); +} - t[0].tv_sec = mtime; - t[0].tv_nsec = 0; - t[1].tv_sec = mtime; - t[1].tv_nsec = 0; - return init_utimes(filename, t); +static void __init do_utime_path(const struct path *path, time64_t mtime) +{ + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + vfs_utimes(path, t); } static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; - char *name; time64_t mtime; + char name[]; }; static void __init dir_add(const char *name, time64_t mtime) { - struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + size_t nlen = strlen(name) + 1; + struct dir_entry *de; + + de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); - de->name = kstrdup(name, GFP_KERNEL); + strscpy(de->name, name, nlen); de->mtime = mtime; list_add(&de->list, &dir_list); } @@ -151,10 +166,15 @@ static void __init dir_utime(void) list_for_each_entry_safe(de, tmp, &dir_list, list) { list_del(&de->list); do_utime(de->name, de->mtime); - kfree(de->name); kfree(de); } } +#else +static void __init do_utime(char *filename, time64_t mtime) {} +static void __init do_utime_path(const struct path *path, time64_t mtime) {} +static void __init dir_add(const char *name, time64_t mtime) {} +static void __init dir_utime(void) {} +#endif static __initdata time64_t mtime; @@ -166,15 +186,16 @@ static __initdata unsigned long body_len, name_len; static __initdata uid_t uid; static __initdata gid_t gid; static __initdata unsigned rdev; +static __initdata u32 hdr_csum; static void __init parse_header(char *s) { - unsigned long parsed[12]; + unsigned long parsed[13]; char buf[9]; int i; buf[8] = '\0'; - for (i = 0, s += 6; i < 12; i++, s += 8) { + for (i = 0, s += 6; i < 13; i++, s += 8) { memcpy(buf, s, 8); parsed[i] = simple_strtoul(buf, NULL, 16); } @@ -189,6 +210,7 @@ static void __init parse_header(char *s) minor = parsed[8]; rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); name_len = parsed[11]; + hdr_csum = parsed[12]; } /* FSM */ @@ -257,12 +279,15 @@ static int __init do_collect(void) static int __init do_header(void) { - if (memcmp(collected, "070707", 6)==0) { - error("incorrect cpio method used: use -H newc option"); - return 1; - } - if (memcmp(collected, "070701", 6)) { - error("no cpio magic"); + if (!memcmp(collected, "070701", 6)) { + csum_present = false; + } else if (!memcmp(collected, "070702", 6)) { + csum_present = true; + } else { + if (memcmp(collected, "070707", 6) == 0) + error("incorrect cpio method used: use -H newc option"); + else + error("no cpio magic"); return 1; } parse_header(collected); @@ -353,6 +378,7 @@ static int __init do_name(void) if (IS_ERR(wfile)) return 0; wfile_pos = 0; + io_csum = 0; vfs_fchown(wfile, uid, gid); vfs_fchmod(wfile, mode); @@ -380,15 +406,13 @@ static int __init do_name(void) static int __init do_copy(void) { if (byte_count >= body_len) { - struct timespec64 t[2] = { }; if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) error("write error"); - t[0].tv_sec = mtime; - t[1].tv_sec = mtime; - vfs_utimes(&wfile->f_path, t); - + do_utime_path(&wfile->f_path, mtime); fput(wfile); + if (csum_present && io_csum != hdr_csum) + error("bad data checksum"); eat(body_len); state = SkipIt; return 0; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 7c08eb3c258d..54cb6264f8cf 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -45,6 +45,7 @@ struct mqueue_fs_context { struct ipc_namespace *ipc_ns; + bool newns; /* Set if newly created ipc namespace */ }; #define MQUEUE_MAGIC 0x19800202 @@ -427,6 +428,14 @@ static int mqueue_get_tree(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; + /* + * With a newly created ipc namespace, we don't need to do a search + * for an ipc namespace match, but we still need to set s_fs_info. + */ + if (ctx->newns) { + fc->s_fs_info = ctx->ipc_ns; + return get_tree_nodev(fc, mqueue_fill_super); + } return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); } @@ -454,6 +463,10 @@ static int mqueue_init_fs_context(struct fs_context *fc) return 0; } +/* + * mq_init_ns() is currently the only caller of mq_create_mount(). + * So the ns parameter is always a newly created ipc namespace. + */ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) { struct mqueue_fs_context *ctx; @@ -465,6 +478,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) return ERR_CAST(fc); ctx = fc->fs_private; + ctx->newns = true; put_ipc_ns(ctx->ipc_ns); ctx->ipc_ns = get_ipc_ns(ns); put_user_ns(fc->user_ns); diff --git a/ipc/sem.c b/ipc/sem.c index 0dbdb98fdf2d..c8496f98b139 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -766,7 +766,6 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) for (sop = sops; sop < sops + nsops; sop++) { curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; - result = curr->semval; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; @@ -1430,7 +1429,6 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, if (err) goto out_rcu_wakeup; - err = -EACCES; switch (cmd) { case GETALL: { @@ -1995,7 +1993,9 @@ long __do_semtimedop(int semid, struct sembuf *sops, int max, locknum; bool undos = false, alter = false, dupsop = false; struct sem_queue queue; - unsigned long dup = 0, jiffies_left = 0; + unsigned long dup = 0; + ktime_t expires, *exp = NULL; + bool timed_out = false; if (nsops < 1 || semid < 0) return -EINVAL; @@ -2003,12 +2003,11 @@ long __do_semtimedop(int semid, struct sembuf *sops, return -E2BIG; if (timeout) { - if (timeout->tv_sec < 0 || timeout->tv_nsec < 0 || - timeout->tv_nsec >= 1000000000L) { - error = -EINVAL; - goto out; - } - jiffies_left = timespec64_to_jiffies(timeout); + if (!timespec64_valid(timeout)) + return -EINVAL; + expires = ktime_add_safe(ktime_get(), + timespec64_to_ktime(*timeout)); + exp = &expires; } @@ -2166,10 +2165,8 @@ long __do_semtimedop(int semid, struct sembuf *sops, sem_unlock(sma, locknum); rcu_read_unlock(); - if (timeout) - jiffies_left = schedule_timeout(jiffies_left); - else - schedule(); + timed_out = !schedule_hrtimeout_range(exp, + current->timer_slack_ns, HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or @@ -2210,7 +2207,7 @@ long __do_semtimedop(int semid, struct sembuf *sops, /* * If an interrupt occurred we have to clean up the queue. */ - if (timeout && jiffies_left == 0) + if (timed_out) error = -EAGAIN; } while (error == -EINTR && !signal_pending(current)); /* spurious */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4d57c03714f4..71122e01623c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -222,9 +222,6 @@ static __init char *get_last_crashkernel(char *cmdline, p = strstr(p+1, name); } - if (!ck_cmdline) - return NULL; - return ck_cmdline; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 02a65d554340..80bfea5dd5c4 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -73,7 +73,7 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; * hung task is detected: */ unsigned int __read_mostly sysctl_hung_task_panic = - CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC); static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/kernel/kcov.c b/kernel/kcov.c index b3732b210593..e19c84b02452 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -204,8 +204,16 @@ void notrace __sanitizer_cov_trace_pc(void) /* The first 64-bit word is the number of subsequent PCs. */ pos = READ_ONCE(area[0]) + 1; if (likely(pos < t->kcov_size)) { - area[pos] = ip; + /* Previously we write pc before updating pos. However, some + * early interrupt code could bypass check_kcov_mode() check + * and invoke __sanitizer_cov_trace_pc(). If such interrupt is + * raised between writing pc and updating pos, the pc could be + * overitten by the recursive __sanitizer_cov_trace_pc(). + * Update pos before writing pc to avoid such interleaving. + */ WRITE_ONCE(area[0], pos); + barrier(); + area[pos] = ip; } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); @@ -236,11 +244,13 @@ static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) start_index = 1 + count * KCOV_WORDS_PER_CMP; end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); if (likely(end_pos <= max_pos)) { + /* See comment in __sanitizer_cov_trace_pc(). */ + WRITE_ONCE(area[0], count + 1); + barrier(); area[start_index] = type; area[start_index + 1] = arg1; area[start_index + 2] = arg2; area[start_index + 3] = ip; - WRITE_ONCE(area[0], count + 1); } } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8b3e9a2014cf..4d34c78334ce 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -768,7 +768,6 @@ static struct page *kimage_alloc_page(struct kimage *image, kimage_free_pages(old_page); continue; } - addr = old_addr; page = old_page; break; } @@ -788,7 +787,6 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; - result = 0; if (image->file_mode) kbuf = segment->kbuf; else diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a46a3723bc66..f4f8cb0435b4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -52,7 +52,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, 0); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ccc4b465775b..49c29baf9907 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -829,11 +829,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task, } #endif -#ifdef PTRACE_SINGLESTEP #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) -#else -#define is_singlestep(request) 0 -#endif #ifdef PTRACE_SINGLEBLOCK #define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) @@ -1221,9 +1217,7 @@ int ptrace_request(struct task_struct *child, long request, } #endif -#ifdef PTRACE_SINGLESTEP case PTRACE_SINGLESTEP: -#endif #ifdef PTRACE_SINGLEBLOCK case PTRACE_SINGLEBLOCK: #endif diff --git a/kernel/relay.c b/kernel/relay.c index d1a67fbb819d..6a611e779e95 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -440,7 +440,7 @@ int relay_prepare_cpu(unsigned int cpu) mutex_lock(&relay_channels_mutex); list_for_each_entry(chan, &relay_channels, list) { - if ((buf = *per_cpu_ptr(chan->buf, cpu))) + if (*per_cpu_ptr(chan->buf, cpu)) continue; buf = relay_open_buf(chan, cpu); if (!buf) { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index bcac5a9043aa..f7e246336218 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -153,6 +154,23 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } +static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + /* No idea if I'm allowed to access that here, now. */ + struct file *exe_file = get_task_exe_file(tsk); + + if (exe_file) { + /* Following cp_new_stat64() in stat.c . */ + stats->ac_exe_dev = + huge_encode_dev(exe_file->f_inode->i_sb->s_dev); + stats->ac_exe_inode = exe_file->f_inode->i_ino; + fput(exe_file); + } else { + stats->ac_exe_dev = 0; + stats->ac_exe_inode = 0; + } +} + static void fill_stats(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct task_struct *tsk, struct taskstats *stats) @@ -175,6 +193,9 @@ static void fill_stats(struct user_namespace *user_ns, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); + + /* add executable info */ + exe_add_tsk(stats, tsk); } static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) @@ -620,6 +641,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) goto err; fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); + if (group_dead) + stats->ac_flag |= AGROUP; /* * Doesn't matter if tsk is the leader or the last group member leaving @@ -665,6 +688,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .netnsok = true, }; /* Needed early in initialization */ diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 1d261fbe367b..4252f0645b9e 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -23,15 +23,20 @@ void bacct_add_tsk(struct user_namespace *user_ns, { const struct cred *tcred; u64 utime, stime, utimescaled, stimescaled; - u64 delta; + u64 now_ns, delta; time64_t btime; BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); /* calculate task elapsed time in nsec */ - delta = ktime_get_ns() - tsk->start_time; + now_ns = ktime_get_ns(); + /* store whole group time first */ + delta = now_ns - tsk->group_leader->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); + stats->ac_tgetime = delta; + delta = now_ns - tsk->start_time; + do_div(delta, NSEC_PER_USEC); stats->ac_etime = delta; /* Convert to seconds for btime (note y2106 limit) */ btime = ktime_get_real_seconds() - div_u64(delta, USEC_PER_SEC); @@ -51,6 +56,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_nice = task_nice(tsk); stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); + stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 40024e03d422..20a7a55e62b6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,7 +57,7 @@ int __read_mostly sysctl_hardlockup_all_cpu_backtrace; * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these @@ -168,7 +168,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index b8cc65d22169..b1e506f1d328 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1073,13 +1073,6 @@ config BOOTPARAM_SOFTLOCKUP_PANIC Say N if unsure. -config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE - int - depends on SOFTLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC - default 1 if BOOTPARAM_SOFTLOCKUP_PANIC - config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR @@ -1121,13 +1114,6 @@ config BOOTPARAM_HARDLOCKUP_PANIC Say N if unsure. -config BOOTPARAM_HARDLOCKUP_PANIC_VALUE - int - depends on HARDLOCKUP_DETECTOR - range 0 1 - default 0 if !BOOTPARAM_HARDLOCKUP_PANIC - default 1 if BOOTPARAM_HARDLOCKUP_PANIC - config DETECT_HUNG_TASK bool "Detect Hung Tasks" depends on DEBUG_KERNEL @@ -1175,13 +1161,6 @@ config BOOTPARAM_HUNG_TASK_PANIC Say N if unsure. -config BOOTPARAM_HUNG_TASK_PANIC_VALUE - int - depends on DETECT_HUNG_TASK - range 0 1 - default 0 if !BOOTPARAM_HUNG_TASK_PANIC - default 1 if BOOTPARAM_HUNG_TASK_PANIC - config WQ_WATCHDOG bool "Detect Workqueue Stalls" depends on DEBUG_KERNEL diff --git a/lib/glob.c b/lib/glob.c index 85ecbda45cd8..15b73f490720 100644 --- a/lib/glob.c +++ b/lib/glob.c @@ -45,7 +45,7 @@ bool __pure glob_match(char const *pat, char const *str) * (no exception for /), it can be easily proved that there's * never a need to backtrack multiple levels. */ - char const *back_pat = NULL, *back_str = back_str; + char const *back_pat = NULL, *back_str; /* * Loop over each token (character or class) in pat, matching diff --git a/lib/string.c b/lib/string.c index 485777c9da83..6f334420f687 100644 --- a/lib/string.c +++ b/lib/string.c @@ -517,21 +517,13 @@ EXPORT_SYMBOL(strnlen); size_t strspn(const char *s, const char *accept) { const char *p; - const char *a; - size_t count = 0; for (p = s; *p != '\0'; ++p) { - for (a = accept; *a != '\0'; ++a) { - if (*p == *a) - break; - } - if (*a == '\0') - return count; - ++count; + if (!strchr(accept, *p)) + break; } - return count; + return p - s; } - EXPORT_SYMBOL(strspn); #endif @@ -544,17 +536,12 @@ EXPORT_SYMBOL(strspn); size_t strcspn(const char *s, const char *reject) { const char *p; - const char *r; - size_t count = 0; for (p = s; *p != '\0'; ++p) { - for (r = reject; *r != '\0'; ++r) { - if (*p == *r) - return count; - } - ++count; + if (strchr(reject, *p)) + break; } - return count; + return p - s; } EXPORT_SYMBOL(strcspn); #endif diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 4f877e9551d5..5ed3beb066e6 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -757,6 +757,9 @@ char **devm_kasprintf_strarray(struct device *dev, const char *prefix, size_t n) return ERR_PTR(-ENOMEM); } + ptr->n = n; + devres_add(dev, ptr); + return ptr->array; } EXPORT_SYMBOL_GPL(devm_kasprintf_strarray); diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 3ca717f11397..c95db11a6906 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -279,13 +279,18 @@ static int __init do_kmem_cache_rcu_persistent(int size, int *total_failures) c = kmem_cache_create("test_cache", size, size, SLAB_TYPESAFE_BY_RCU, NULL); buf = kmem_cache_alloc(c, GFP_KERNEL); + if (!buf) + goto out; saved_ptr = buf; fill_with_garbage(buf, size); buf_contents = kmalloc(size, GFP_KERNEL); - if (!buf_contents) + if (!buf_contents) { + kmem_cache_free(c, buf); goto out; + } used_objects = kmalloc_array(maxiter, sizeof(void *), GFP_KERNEL); if (!used_objects) { + kmem_cache_free(c, buf); kfree(buf_contents); goto out; } @@ -306,11 +311,14 @@ static int __init do_kmem_cache_rcu_persistent(int size, int *total_failures) } } + for (iter = 0; iter < maxiter; iter++) + kmem_cache_free(c, used_objects[iter]); + free_out: - kmem_cache_destroy(c); kfree(buf_contents); kfree(used_objects); out: + kmem_cache_destroy(c); *total_failures += fail; return 1; } diff --git a/lib/test_string.c b/lib/test_string.c index 9dfd6f52de92..c5cb92fb710e 100644 --- a/lib/test_string.c +++ b/lib/test_string.c @@ -179,6 +179,34 @@ static __init int strnchr_selftest(void) return 0; } +static __init int strspn_selftest(void) +{ + static const struct strspn_test { + const char str[16]; + const char accept[16]; + const char reject[16]; + unsigned a; + unsigned r; + } tests[] __initconst = { + { "foobar", "", "", 0, 6 }, + { "abba", "abc", "ABBA", 4, 4 }, + { "abba", "a", "b", 1, 1 }, + { "", "abc", "abc", 0, 0}, + }; + const struct strspn_test *s = tests; + size_t i, res; + + for (i = 0; i < ARRAY_SIZE(tests); ++i, ++s) { + res = strspn(s->str, s->accept); + if (res != s->a) + return 0x100 + 2*i; + res = strcspn(s->str, s->reject); + if (res != s->r) + return 0x100 + 2*i + 1; + } + return 0; +} + static __exit void string_selftest_remove(void) { } @@ -212,6 +240,11 @@ static __init int string_selftest_init(void) if (subtest) goto fail; + test = 6; + subtest = strspn_selftest(); + if (subtest) + goto fail; + pr_info("String selftests succeeded\n"); return 0; fail: diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index dcd8d8750b8b..4dd6a804ce41 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -36,6 +36,7 @@ def getsizes(file, format): if name.startswith("__se_compat_sys"): continue if name.startswith("__addressable_"): continue if name == "linux_banner": continue + if name == "vermagic": continue # statics and some other optimizations adds random .NUMBER name = re_NUMBER.sub('', name) sym[name] = sym.get(name, 0) + int(size, 16) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 5fbad61fe490..7075e26ab2c4 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -45,8 +45,13 @@ else fi fi -declare -A cache -declare -A modcache +declare aarray_support=true +declare -A cache 2>/dev/null +if [[ $? != 0 ]]; then + aarray_support=false +else + declare -A modcache +fi find_module() { if [[ -n $debuginfod ]] ; then @@ -97,7 +102,7 @@ parse_symbol() { if [[ $module == "" ]] ; then local objfile=$vmlinux - elif [[ "${modcache[$module]+isset}" == "isset" ]]; then + elif [[ $aarray_support == true && "${modcache[$module]+isset}" == "isset" ]]; then local objfile=${modcache[$module]} else local objfile=$(find_module) @@ -105,7 +110,9 @@ parse_symbol() { echo "WARNING! Modules path isn't set, but is needed to parse this symbol" >&2 return fi - modcache[$module]=$objfile + if [[ $aarray_support == true ]]; then + modcache[$module]=$objfile + fi fi # Remove the englobing parenthesis @@ -125,7 +132,7 @@ parse_symbol() { # Use 'nm vmlinux' to figure out the base address of said symbol. # It's actually faster to call it every time than to load it # all into bash. - if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then + if [[ $aarray_support == true && "${cache[$module,$name]+isset}" == "isset" ]]; then local base_addr=${cache[$module,$name]} else local base_addr=$(nm "$objfile" 2>/dev/null | awk '$3 == "'$name'" && ($2 == "t" || $2 == "T") {print $1; exit}') @@ -133,7 +140,9 @@ parse_symbol() { # address not found return fi - cache[$module,$name]="$base_addr" + if [[ $aarray_support == true ]]; then + cache[$module,$name]="$base_addr" + fi fi # Let's start doing the math to get the exact address into the # symbol. First, strip out the symbol total length. @@ -149,11 +158,13 @@ parse_symbol() { # Pass it to addr2line to get filename and line number # Could get more than one result - if [[ "${cache[$module,$address]+isset}" == "isset" ]]; then + if [[ $aarray_support == true && "${cache[$module,$address]+isset}" == "isset" ]]; then local code=${cache[$module,$address]} else local code=$(${CROSS_COMPILE}addr2line -i -e "$objfile" "$address" 2>/dev/null) - cache[$module,$address]=$code + if [[ $aarray_support == true ]]; then + cache[$module,$address]=$code + fi fi # addr2line doesn't return a proper error code if it fails, so diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 6bd5221d37b8..ab123b498fd9 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -983,6 +983,7 @@ sub get_maintainers { } foreach my $email (@file_emails) { + $email = mailmap_email($email); my ($name, $address) = parse_email($email); my $tmp_email = format_email($name, $address, $email_usename); diff --git a/tools/accounting/.gitignore b/tools/accounting/.gitignore index c45fb4ed4309..522a690aaf3d 100644 --- a/tools/accounting/.gitignore +++ b/tools/accounting/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only getdelays +procacct diff --git a/tools/accounting/Makefile b/tools/accounting/Makefile index 03687f19cbb1..11def1ad046c 100644 --- a/tools/accounting/Makefile +++ b/tools/accounting/Makefile @@ -2,7 +2,7 @@ CC := $(CROSS_COMPILE)gcc CFLAGS := -I../../usr/include -PROGS := getdelays +PROGS := getdelays procacct all: $(PROGS) diff --git a/tools/accounting/procacct.c b/tools/accounting/procacct.c new file mode 100644 index 000000000000..8353d3237e50 --- /dev/null +++ b/tools/accounting/procacct.c @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: GPL-2.0 +/* procacct.c + * + * Demonstrator of fetching resource data on task exit, as a way + * to accumulate accurate program resource usage statistics, without + * prior identification of the programs. For that, the fields for + * device and inode of the program executable binary file are also + * extracted in addition to the command string. + * + * The TGID together with the PID and the AGROUP flag allow + * identification of threads in a process and single-threaded processes. + * The ac_tgetime field gives proper whole-process walltime. + * + * Written (changed) by Thomas Orgis, University of Hamburg in 2022 + * + * This is a cheap derivation (inheriting the style) of getdelays.c: + * + * Utility to get per-pid and per-tgid delay accounting statistics + * Also illustrates usage of the taskstats interface + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2005 + * Copyright (C) Balbir Singh, IBM Corp. 2006 + * Copyright (c) Jay Lan, SGI. 2006 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Generic macros for dealing with netlink sockets. Might be duplicated + * elsewhere. It is recommended that commercial grade applications use + * libnl or libnetlink and use the interfaces provided by the library + */ +#define GENLMSG_DATA(glh) ((void *)(NLMSG_DATA(glh) + GENL_HDRLEN)) +#define GENLMSG_PAYLOAD(glh) (NLMSG_PAYLOAD(glh, 0) - GENL_HDRLEN) +#define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) +#define NLA_PAYLOAD(len) (len - NLA_HDRLEN) + +#define err(code, fmt, arg...) \ + do { \ + fprintf(stderr, fmt, ##arg); \ + exit(code); \ + } while (0) + +int rcvbufsz; +char name[100]; +int dbg; +int print_delays; +int print_io_accounting; +int print_task_context_switch_counts; + +#define PRINTF(fmt, arg...) { \ + if (dbg) { \ + printf(fmt, ##arg); \ + } \ + } + +/* Maximum size of response requested or message sent */ +#define MAX_MSG_SIZE 1024 +/* Maximum number of cpus expected to be specified in a cpumask */ +#define MAX_CPUS 32 + +struct msgtemplate { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[MAX_MSG_SIZE]; +}; + +char cpumask[100+6*MAX_CPUS]; + +static void usage(void) +{ + fprintf(stderr, "procacct [-v] [-w logfile] [-r bufsize] [-m cpumask]\n"); + fprintf(stderr, " -v: debug on\n"); +} + +/* + * Create a raw netlink socket and bind + */ +static int create_nl_socket(int protocol) +{ + int fd; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW, protocol); + if (fd < 0) + return -1; + + if (rcvbufsz) + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, + &rcvbufsz, sizeof(rcvbufsz)) < 0) { + fprintf(stderr, "Unable to set socket rcv buf size to %d\n", + rcvbufsz); + goto error; + } + + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + + if (bind(fd, (struct sockaddr *) &local, sizeof(local)) < 0) + goto error; + + return fd; +error: + close(fd); + return -1; +} + + +static int send_cmd(int sd, __u16 nlmsg_type, __u32 nlmsg_pid, + __u8 genl_cmd, __u16 nla_type, + void *nla_data, int nla_len) +{ + struct nlattr *na; + struct sockaddr_nl nladdr; + int r, buflen; + char *buf; + + struct msgtemplate msg; + + msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + msg.n.nlmsg_type = nlmsg_type; + msg.n.nlmsg_flags = NLM_F_REQUEST; + msg.n.nlmsg_seq = 0; + msg.n.nlmsg_pid = nlmsg_pid; + msg.g.cmd = genl_cmd; + msg.g.version = 0x1; + na = (struct nlattr *) GENLMSG_DATA(&msg); + na->nla_type = nla_type; + na->nla_len = nla_len + 1 + NLA_HDRLEN; + memcpy(NLA_DATA(na), nla_data, nla_len); + msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len); + + buf = (char *) &msg; + buflen = msg.n.nlmsg_len; + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + while ((r = sendto(sd, buf, buflen, 0, (struct sockaddr *) &nladdr, + sizeof(nladdr))) < buflen) { + if (r > 0) { + buf += r; + buflen -= r; + } else if (errno != EAGAIN) + return -1; + } + return 0; +} + + +/* + * Probe the controller in genetlink to find the family id + * for the TASKSTATS family + */ +static int get_family_id(int sd) +{ + struct { + struct nlmsghdr n; + struct genlmsghdr g; + char buf[256]; + } ans; + + int id = 0, rc; + struct nlattr *na; + int rep_len; + + strcpy(name, TASKSTATS_GENL_NAME); + rc = send_cmd(sd, GENL_ID_CTRL, getpid(), CTRL_CMD_GETFAMILY, + CTRL_ATTR_FAMILY_NAME, (void *)name, + strlen(TASKSTATS_GENL_NAME)+1); + if (rc < 0) + return 0; /* sendto() failure? */ + + rep_len = recv(sd, &ans, sizeof(ans), 0); + if (ans.n.nlmsg_type == NLMSG_ERROR || + (rep_len < 0) || !NLMSG_OK((&ans.n), rep_len)) + return 0; + + na = (struct nlattr *) GENLMSG_DATA(&ans); + na = (struct nlattr *) ((char *) na + NLA_ALIGN(na->nla_len)); + if (na->nla_type == CTRL_ATTR_FAMILY_ID) + id = *(__u16 *) NLA_DATA(na); + + return id; +} + +#define average_ms(t, c) (t / 1000000ULL / (c ? c : 1)) + +static void print_procacct(struct taskstats *t) +{ + /* First letter: T is a mere thread, G the last in a group, U unknown. */ + printf( + "%c pid=%lu tgid=%lu uid=%lu wall=%llu gwall=%llu cpu=%llu vmpeak=%llu rsspeak=%llu dev=%lu:%lu inode=%llu comm=%s\n" + , t->version >= 12 ? (t->ac_flag & AGROUP ? 'P' : 'T') : '?' + , (unsigned long)t->ac_pid + , (unsigned long)(t->version >= 12 ? t->ac_tgid : 0) + , (unsigned long)t->ac_uid + , (unsigned long long)t->ac_etime + , (unsigned long long)(t->version >= 12 ? t->ac_tgetime : 0) + , (unsigned long long)(t->ac_utime+t->ac_stime) + , (unsigned long long)t->hiwater_vm + , (unsigned long long)t->hiwater_rss + , (unsigned long)(t->version >= 12 ? MAJOR(t->ac_exe_dev) : 0) + , (unsigned long)(t->version >= 12 ? MINOR(t->ac_exe_dev) : 0) + , (unsigned long long)(t->version >= 12 ? t->ac_exe_inode : 0) + , t->ac_comm + ); +} + +void handle_aggr(int mother, struct nlattr *na, int fd) +{ + int aggr_len = NLA_PAYLOAD(na->nla_len); + int len2 = 0; + pid_t rtid = 0; + + na = (struct nlattr *) NLA_DATA(na); + while (len2 < aggr_len) { + switch (na->nla_type) { + case TASKSTATS_TYPE_PID: + rtid = *(int *) NLA_DATA(na); + PRINTF("PID\t%d\n", rtid); + break; + case TASKSTATS_TYPE_TGID: + rtid = *(int *) NLA_DATA(na); + PRINTF("TGID\t%d\n", rtid); + break; + case TASKSTATS_TYPE_STATS: + if (mother == TASKSTATS_TYPE_AGGR_PID) + print_procacct((struct taskstats *) NLA_DATA(na)); + if (fd) { + if (write(fd, NLA_DATA(na), na->nla_len) < 0) + err(1, "write error\n"); + } + break; + case TASKSTATS_TYPE_NULL: + break; + default: + fprintf(stderr, "Unknown nested nla_type %d\n", + na->nla_type); + break; + } + len2 += NLA_ALIGN(na->nla_len); + na = (struct nlattr *)((char *)na + + NLA_ALIGN(na->nla_len)); + } +} + +int main(int argc, char *argv[]) +{ + int c, rc, rep_len, aggr_len, len2; + int cmd_type = TASKSTATS_CMD_ATTR_UNSPEC; + __u16 id; + __u32 mypid; + + struct nlattr *na; + int nl_sd = -1; + int len = 0; + pid_t tid = 0; + + int fd = 0; + int write_file = 0; + int maskset = 0; + char *logfile = NULL; + int containerset = 0; + char *containerpath = NULL; + int cfd = 0; + int forking = 0; + sigset_t sigset; + + struct msgtemplate msg; + + while (!forking) { + c = getopt(argc, argv, "m:vr:"); + if (c < 0) + break; + + switch (c) { + case 'w': + logfile = strdup(optarg); + printf("write to file %s\n", logfile); + write_file = 1; + break; + case 'r': + rcvbufsz = atoi(optarg); + printf("receive buf size %d\n", rcvbufsz); + if (rcvbufsz < 0) + err(1, "Invalid rcv buf size\n"); + break; + case 'm': + strncpy(cpumask, optarg, sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; + maskset = 1; + break; + case 'v': + printf("debug on\n"); + dbg = 1; + break; + default: + usage(); + exit(-1); + } + } + if (!maskset) { + maskset = 1; + strncpy(cpumask, "1", sizeof(cpumask)); + cpumask[sizeof(cpumask) - 1] = '\0'; + } + printf("cpumask %s maskset %d\n", cpumask, maskset); + + if (write_file) { + fd = open(logfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd == -1) { + perror("Cannot open output file\n"); + exit(1); + } + } + + nl_sd = create_nl_socket(NETLINK_GENERIC); + if (nl_sd < 0) + err(1, "error creating Netlink socket\n"); + + mypid = getpid(); + id = get_family_id(nl_sd); + if (!id) { + fprintf(stderr, "Error getting family id, errno %d\n", errno); + goto err; + } + PRINTF("family id %d\n", id); + + if (maskset) { + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, + &cpumask, strlen(cpumask) + 1); + PRINTF("Sent register cpumask, retval %d\n", rc); + if (rc < 0) { + fprintf(stderr, "error sending register cpumask\n"); + goto err; + } + } + + do { + rep_len = recv(nl_sd, &msg, sizeof(msg), 0); + PRINTF("received %d bytes\n", rep_len); + + if (rep_len < 0) { + fprintf(stderr, "nonfatal reply error: errno %d\n", + errno); + continue; + } + if (msg.n.nlmsg_type == NLMSG_ERROR || + !NLMSG_OK((&msg.n), rep_len)) { + struct nlmsgerr *err = NLMSG_DATA(&msg); + + fprintf(stderr, "fatal reply error, errno %d\n", + err->error); + goto done; + } + + PRINTF("nlmsghdr size=%zu, nlmsg_len=%d, rep_len=%d\n", + sizeof(struct nlmsghdr), msg.n.nlmsg_len, rep_len); + + + rep_len = GENLMSG_PAYLOAD(&msg.n); + + na = (struct nlattr *) GENLMSG_DATA(&msg); + len = 0; + while (len < rep_len) { + len += NLA_ALIGN(na->nla_len); + int mother = na->nla_type; + + PRINTF("mother=%i\n", mother); + switch (na->nla_type) { + case TASKSTATS_TYPE_AGGR_PID: + case TASKSTATS_TYPE_AGGR_TGID: + /* For nested attributes, na follows */ + handle_aggr(mother, na, fd); + break; + default: + fprintf(stderr, "Unexpected nla_type %d\n", + na->nla_type); + case TASKSTATS_TYPE_NULL: + break; + } + na = (struct nlattr *) (GENLMSG_DATA(&msg) + len); + } + } while (1); +done: + if (maskset) { + rc = send_cmd(nl_sd, id, mypid, TASKSTATS_CMD_GET, + TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK, + &cpumask, strlen(cpumask) + 1); + printf("Sent deregister mask, retval %d\n", rc); + if (rc < 0) + err(rc, "error sending deregister cpumask\n"); + } +err: + close(nl_sd); + if (fd) + close(fd); + if (cfd) + close(cfd); + return 0; +} diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index 0e2c8a5838b1..dc838e26a5b9 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include +#include #include #include #include @@ -20,10 +22,12 @@ #define xstr(s) #s #define str(s) xstr(s) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) static unsigned int offset; static unsigned int ino = 721; static time_t default_mtime; +static bool do_csum = false; struct file_handler { const char *type; @@ -77,7 +81,7 @@ static void cpio_trailer(void) sprintf(s, "%s%08X%08X%08lX%08lX%08X%08lX" "%08X%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ 0, /* ino */ 0, /* mode */ (long) 0, /* uid */ @@ -109,7 +113,7 @@ static int cpio_mkslink(const char *name, const char *target, name++; sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX" "%08X%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ ino++, /* ino */ S_IFLNK | mode, /* mode */ (long) uid, /* uid */ @@ -158,7 +162,7 @@ static int cpio_mkgeneric(const char *name, unsigned int mode, name++; sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX" "%08X%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ ino++, /* ino */ mode, /* mode */ (long) uid, /* uid */ @@ -252,7 +256,7 @@ static int cpio_mknod(const char *name, unsigned int mode, name++; sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX" "%08X%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ ino++, /* ino */ mode, /* mode */ (long) uid, /* uid */ @@ -292,19 +296,42 @@ static int cpio_mknod_line(const char *line) return rc; } +static int cpio_mkfile_csum(int fd, unsigned long size, uint32_t *csum) +{ + while (size) { + unsigned char filebuf[65536]; + ssize_t this_read; + size_t i, this_size = MIN(size, sizeof(filebuf)); + + this_read = read(fd, filebuf, this_size); + if (this_read <= 0 || this_read > this_size) + return -1; + + for (i = 0; i < this_read; i++) + *csum += filebuf[i]; + + size -= this_read; + } + /* seek back to the start for data segment I/O */ + if (lseek(fd, 0, SEEK_SET) < 0) + return -1; + + return 0; +} + static int cpio_mkfile(const char *name, const char *location, unsigned int mode, uid_t uid, gid_t gid, unsigned int nlinks) { char s[256]; - char *filebuf = NULL; struct stat buf; - long size; + unsigned long size; int file = -1; int retval; int rc = -1; int namesize; unsigned int i; + uint32_t csum = 0; mode |= S_IFREG; @@ -326,29 +353,29 @@ static int cpio_mkfile(const char *name, const char *location, buf.st_mtime = 0xffffffff; } - filebuf = malloc(buf.st_size); - if (!filebuf) { - fprintf (stderr, "out of memory\n"); + if (buf.st_size > 0xffffffff) { + fprintf(stderr, "%s: Size exceeds maximum cpio file size\n", + location); goto error; } - retval = read (file, filebuf, buf.st_size); - if (retval < 0) { - fprintf (stderr, "Can not read %s file\n", location); + if (do_csum && cpio_mkfile_csum(file, buf.st_size, &csum) < 0) { + fprintf(stderr, "Failed to checksum file %s\n", location); goto error; } size = 0; for (i = 1; i <= nlinks; i++) { /* data goes on last link */ - if (i == nlinks) size = buf.st_size; + if (i == nlinks) + size = buf.st_size; if (name[0] == '/') name++; namesize = strlen(name) + 1; sprintf(s,"%s%08X%08X%08lX%08lX%08X%08lX" "%08lX%08X%08X%08X%08X%08X%08X", - "070701", /* magic */ + do_csum ? "070702" : "070701", /* magic */ ino, /* ino */ mode, /* mode */ (long) uid, /* uid */ @@ -361,28 +388,39 @@ static int cpio_mkfile(const char *name, const char *location, 0, /* rmajor */ 0, /* rminor */ namesize, /* namesize */ - 0); /* chksum */ + size ? csum : 0); /* chksum */ push_hdr(s); push_string(name); push_pad(); - if (size) { - if (fwrite(filebuf, size, 1, stdout) != 1) { + while (size) { + unsigned char filebuf[65536]; + ssize_t this_read; + size_t this_size = MIN(size, sizeof(filebuf)); + + this_read = read(file, filebuf, this_size); + if (this_read <= 0 || this_read > this_size) { + fprintf(stderr, "Can not read %s file\n", location); + goto error; + } + + if (fwrite(filebuf, this_read, 1, stdout) != 1) { fprintf(stderr, "writing filebuf failed\n"); goto error; } - offset += size; - push_pad(); + offset += this_read; + size -= this_read; } + push_pad(); name += namesize; } ino++; rc = 0; - + error: - if (filebuf) free(filebuf); - if (file >= 0) close(file); + if (file >= 0) + close(file); return rc; } @@ -458,7 +496,7 @@ static int cpio_mkfile_line(const char *line) static void usage(const char *prog) { fprintf(stderr, "Usage:\n" - "\t%s [-t ] \n" + "\t%s [-t ] [-c] \n" "\n" " is a file containing newline separated entries that\n" "describe the files to be included in the initramfs archive:\n" @@ -493,7 +531,8 @@ static void usage(const char *prog) "\n" " is time in seconds since Epoch that will be used\n" "as mtime for symlinks, special files and directories. The default\n" - "is to use the current time for these entries.\n", + "is to use the current time for these entries.\n" + "-c: calculate and store 32-bit checksums for file data.\n", prog); } @@ -535,7 +574,7 @@ int main (int argc, char *argv[]) default_mtime = time(NULL); while (1) { - int opt = getopt(argc, argv, "t:h"); + int opt = getopt(argc, argv, "t:ch"); char *invalid; if (opt == -1) @@ -550,6 +589,9 @@ int main (int argc, char *argv[]) exit(1); } break; + case 'c': + do_csum = true; + break; case 'h': case '?': usage(argv[0]);