Anton Blanchard e35735b9a5 powerpc: Speed up clear_page by unrolling it
Unroll clear_page 8 times. A simple microbenchmark which
allocates and frees a zeroed page:

for (i = 0; i < iterations; i++) {
	unsigned long p = __get_free_page(GFP_KERNEL | __GFP_ZERO);
	free_page(p);
}

improves 20% on POWER8.

This assumes cacheline sizes won't grow beyond 512 bytes or
page sizes wont drop below 1kB, which is unlikely, but we could
add a runtime check during early init if it makes people nervous.

Michael found that some versions of gcc produce quite bad code
(all multiplies), so we give gcc a hand by using shifts and adds.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2014-10-02 16:04:21 +10:00

188 lines
5.0 KiB
C

#ifndef _ASM_POWERPC_PAGE_64_H
#define _ASM_POWERPC_PAGE_64_H
/*
* Copyright (C) 2001 PPC64 Team, IBM Corp
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
/*
* We always define HW_PAGE_SHIFT to 12 as use of 64K pages remains Linux
* specific, every notion of page number shared with the firmware, TCEs,
* iommu, etc... still uses a page size of 4K.
*/
#define HW_PAGE_SHIFT 12
#define HW_PAGE_SIZE (ASM_CONST(1) << HW_PAGE_SHIFT)
#define HW_PAGE_MASK (~(HW_PAGE_SIZE-1))
/*
* PAGE_FACTOR is the number of bits factor between PAGE_SHIFT and
* HW_PAGE_SHIFT, that is 4K pages.
*/
#define PAGE_FACTOR (PAGE_SHIFT - HW_PAGE_SHIFT)
/* Segment size; normal 256M segments */
#define SID_SHIFT 28
#define SID_MASK ASM_CONST(0xfffffffff)
#define ESID_MASK 0xfffffffff0000000UL
#define GET_ESID(x) (((x) >> SID_SHIFT) & SID_MASK)
/* 1T segments */
#define SID_SHIFT_1T 40
#define SID_MASK_1T 0xffffffUL
#define ESID_MASK_1T 0xffffff0000000000UL
#define GET_ESID_1T(x) (((x) >> SID_SHIFT_1T) & SID_MASK_1T)
#ifndef __ASSEMBLY__
#include <asm/cache.h>
typedef unsigned long pte_basic_t;
static inline void clear_page(void *addr)
{
unsigned long iterations;
unsigned long onex, twox, fourx, eightx;
iterations = ppc64_caches.dlines_per_page / 8;
/*
* Some verisions of gcc use multiply instructions to
* calculate the offsets so lets give it a hand to
* do better.
*/
onex = ppc64_caches.dline_size;
twox = onex << 1;
fourx = onex << 2;
eightx = onex << 3;
asm volatile(
"mtctr %1 # clear_page\n\
.balign 16\n\
1: dcbz 0,%0\n\
dcbz %3,%0\n\
dcbz %4,%0\n\
dcbz %5,%0\n\
dcbz %6,%0\n\
dcbz %7,%0\n\
dcbz %8,%0\n\
dcbz %9,%0\n\
add %0,%0,%10\n\
bdnz+ 1b"
: "=&r" (addr)
: "r" (iterations), "0" (addr), "b" (onex), "b" (twox),
"b" (twox+onex), "b" (fourx), "b" (fourx+onex),
"b" (twox+fourx), "b" (eightx-onex), "r" (eightx)
: "ctr", "memory");
}
extern void copy_page(void *to, void *from);
/* Log 2 of page table size */
extern u64 ppc64_pft_size;
#endif /* __ASSEMBLY__ */
#ifdef CONFIG_PPC_MM_SLICES
#define SLICE_LOW_SHIFT 28
#define SLICE_HIGH_SHIFT 40
#define SLICE_LOW_TOP (0x100000000ul)
#define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT)
#define SLICE_NUM_HIGH (PGTABLE_RANGE >> SLICE_HIGH_SHIFT)
#define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT)
#define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT)
/*
* 1 bit per slice and we have one slice per 1TB
* Right now we support only 64TB.
* IF we change this we will have to change the type
* of high_slices
*/
#define SLICE_MASK_SIZE 8
#ifndef __ASSEMBLY__
struct slice_mask {
u16 low_slices;
u64 high_slices;
};
struct mm_struct;
extern unsigned long slice_get_unmapped_area(unsigned long addr,
unsigned long len,
unsigned long flags,
unsigned int psize,
int topdown);
extern unsigned int get_slice_psize(struct mm_struct *mm,
unsigned long addr);
extern void slice_set_user_psize(struct mm_struct *mm, unsigned int psize);
extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long len, unsigned int psize);
#define slice_mm_new_context(mm) ((mm)->context.id == MMU_NO_CONTEXT)
#endif /* __ASSEMBLY__ */
#else
#define slice_init()
#ifdef CONFIG_PPC_STD_MMU_64
#define get_slice_psize(mm, addr) ((mm)->context.user_psize)
#define slice_set_user_psize(mm, psize) \
do { \
(mm)->context.user_psize = (psize); \
(mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \
} while (0)
#else /* CONFIG_PPC_STD_MMU_64 */
#ifdef CONFIG_PPC_64K_PAGES
#define get_slice_psize(mm, addr) MMU_PAGE_64K
#else /* CONFIG_PPC_64K_PAGES */
#define get_slice_psize(mm, addr) MMU_PAGE_4K
#endif /* !CONFIG_PPC_64K_PAGES */
#define slice_set_user_psize(mm, psize) do { BUG(); } while(0)
#endif /* !CONFIG_PPC_STD_MMU_64 */
#define slice_set_range_psize(mm, start, len, psize) \
slice_set_user_psize((mm), (psize))
#define slice_mm_new_context(mm) 1
#endif /* CONFIG_PPC_MM_SLICES */
#ifdef CONFIG_HUGETLB_PAGE
#ifdef CONFIG_PPC_MM_SLICES
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif
#endif /* !CONFIG_HUGETLB_PAGE */
#define VM_DATA_DEFAULT_FLAGS \
(is_32bit_task() ? \
VM_DATA_DEFAULT_FLAGS32 : VM_DATA_DEFAULT_FLAGS64)
/*
* This is the default if a program doesn't have a PT_GNU_STACK
* program header entry. The PPC64 ELF ABI has a non executable stack
* stack by default, so in the absence of a PT_GNU_STACK program header
* we turn execute permission off.
*/
#define VM_STACK_DEFAULT_FLAGS32 (VM_READ | VM_WRITE | VM_EXEC | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_STACK_DEFAULT_FLAGS64 (VM_READ | VM_WRITE | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_STACK_DEFAULT_FLAGS \
(is_32bit_task() ? \
VM_STACK_DEFAULT_FLAGS32 : VM_STACK_DEFAULT_FLAGS64)
#include <asm-generic/getorder.h>
#endif /* _ASM_POWERPC_PAGE_64_H */