Zhaoxiu Zeng fff7fb0b2d lib/GCD.c: use binary GCD algorithm instead of Euclidean
The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available.  This
allows the unpredictable branches in the bit-at-a-time shifting loop to
be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 10174
  gcd1: elapsed 2120
  gcd2: elapsed 2902
  gcd3: elapsed 2039
  gcd4: elapsed 2812
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9309
  gcd1: elapsed 2280
  gcd2: elapsed 2822
  gcd3: elapsed 2217
  gcd4: elapsed 2710
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9589
  gcd1: elapsed 2098
  gcd2: elapsed 2815
  gcd3: elapsed 2030
  gcd4: elapsed 2718
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9914
  gcd1: elapsed 2309
  gcd2: elapsed 2779
  gcd3: elapsed 2228
  gcd4: elapsed 2709
  PASS

[akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-05-20 17:58:30 -07:00

286 lines
7.0 KiB
Plaintext

config METAG
def_bool y
select EMBEDDED
select GENERIC_ATOMIC64
select GENERIC_CLOCKEVENTS
select GENERIC_IRQ_SHOW
select GENERIC_SMP_IDLE_THREAD
select HAVE_64BIT_ALIGNED_ACCESS
select HAVE_ARCH_TRACEHOOK
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DYNAMIC_FTRACE
select HAVE_EXIT_THREAD
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_TRACER
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
select HAVE_KERNEL_XZ
select HAVE_MEMBLOCK
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_OPROFILE
select HAVE_PERF_EVENTS
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNDERSCORE_SYMBOL_PREFIX
select IRQ_DOMAIN
select MODULES_USE_ELF_RELA
select OF
select OF_EARLY_FLATTREE
select SPARSE_IRQ
select CPU_NO_EFFICIENT_FFS
config STACKTRACE_SUPPORT
def_bool y
config LOCKDEP_SUPPORT
def_bool y
config RWSEM_GENERIC_SPINLOCK
def_bool y
config RWSEM_XCHGADD_ALGORITHM
bool
config GENERIC_HWEIGHT
def_bool y
config GENERIC_CALIBRATE_DELAY
def_bool y
config NO_IOPORT_MAP
def_bool y
source "init/Kconfig"
source "kernel/Kconfig.freezer"
menu "Processor type and features"
config MMU
def_bool y
config STACK_GROWSUP
def_bool y
config HOTPLUG_CPU
bool "Enable CPU hotplug support"
depends on SMP
help
Say Y here to allow turning CPUs off and on. CPUs can be
controlled through /sys/devices/system/cpu.
Say N if you want to disable CPU hotplug.
config HIGHMEM
bool "High Memory Support"
help
The address space of Meta processors is only 4 Gigabytes large
and it has to accommodate user address space, kernel address
space as well as some memory mapped IO. That means that, if you
have a large amount of physical memory and/or IO, not all of the
memory can be "permanently mapped" by the kernel. The physical
memory that is not permanently mapped is called "high memory".
Depending on the selected kernel/user memory split, minimum
vmalloc space and actual amount of RAM, you may not need this
option which should result in a slightly faster kernel.
If unsure, say n.
source "arch/metag/mm/Kconfig"
source "arch/metag/Kconfig.soc"
config METAG_META12
bool
help
Select this from the SoC config symbol to indicate that it contains a
Meta 1.2 core.
config METAG_META21
bool
help
Select this from the SoC config symbol to indicate that it contains a
Meta 2.1 core.
config SMP
bool "Symmetric multi-processing support"
depends on METAG_META21 && METAG_META21_MMU
help
This enables support for systems with more than one thread running
Linux. If you have a system with only one thread running Linux,
say N. Otherwise, say Y.
config NR_CPUS
int "Maximum number of CPUs (2-4)" if SMP
range 2 4 if SMP
default "1" if !SMP
default "4" if SMP
config METAG_SMP_WRITE_REORDERING
bool
help
This attempts to prevent cache-memory incoherence due to external
reordering of writes from different hardware threads when SMP is
enabled. It adds fences (system event 0) to smp_mb and smp_rmb in an
attempt to catch some of the cases, and also before writes to shared
memory in LOCK1 protected atomics and spinlocks.
This will not completely prevent cache incoherency on affected cores.
config METAG_LNKGET_AROUND_CACHE
bool
depends on METAG_META21
help
This indicates that the LNKGET/LNKSET instructions go around the
cache, which requires some extra cache flushes when the memory needs
to be accessed by normal GET/SET instructions too.
choice
prompt "Atomicity primitive"
default METAG_ATOMICITY_LNKGET
help
This option selects the mechanism for performing atomic operations.
config METAG_ATOMICITY_IRQSOFF
depends on !SMP
bool "irqsoff"
help
This option disables interrupts to achieve atomicity. This mechanism
is not SMP-safe.
config METAG_ATOMICITY_LNKGET
depends on METAG_META21
bool "lnkget/lnkset"
help
This option uses the LNKGET and LNKSET instructions to achieve
atomicity. LNKGET/LNKSET are load-link/store-conditional instructions.
Choose this option if your system requires low latency.
config METAG_ATOMICITY_LOCK1
depends on SMP
bool "lock1"
help
This option uses the LOCK1 instruction for atomicity. This is mainly
provided as a debugging aid if the lnkget/lnkset atomicity primitive
isn't working properly.
endchoice
config METAG_FPU
bool "FPU Support"
depends on METAG_META21
default y
help
This option allows processes to use FPU hardware available with this
CPU. If this option is not enabled FPU registers will not be saved
and restored on context-switch.
If you plan on running programs which are compiled to use hard floats
say Y here.
config METAG_DSP
bool "DSP Support"
help
This option allows processes to use DSP hardware available
with this CPU. If this option is not enabled DSP registers
will not be saved and restored on context-switch.
If you plan on running DSP programs say Y here.
config METAG_PERFCOUNTER_IRQS
bool "PerfCounters interrupt support"
depends on METAG_META21
help
This option enables using interrupts to collect information from
Performance Counters. This option is supported in new META21
(starting from HTP265).
When disabled, Performance Counters information will be collected
based on Timer Interrupt.
config HW_PERF_EVENTS
def_bool METAG_PERFCOUNTER_IRQS && PERF_EVENTS
config METAG_DA
bool "DA support"
help
Say Y if you plan to use a DA debug adapter with Linux. The presence
of the DA will be detected automatically at boot, so it is safe to say
Y to this option even when booting without a DA.
This enables support for services provided by DA JTAG debug adapters,
such as:
- communication over DA channels (such as the console driver).
- use of the DA filesystem.
menu "Boot options"
config METAG_BUILTIN_DTB
bool "Embed DTB in kernel image"
default y
help
Embeds a device tree binary in the kernel image.
config METAG_BUILTIN_DTB_NAME
string "Built in DTB"
depends on METAG_BUILTIN_DTB
help
Set the name of the DTB to embed (leave blank to pick one
automatically based on kernel configuration).
config CMDLINE_BOOL
bool "Default bootloader kernel arguments"
config CMDLINE
string "Kernel command line"
depends on CMDLINE_BOOL
help
On some architectures there is currently no way for the boot loader
to pass arguments to the kernel. For these architectures, you should
supply some command-line options at build time by entering them
here.
config CMDLINE_FORCE
bool "Force default kernel command string"
depends on CMDLINE_BOOL
help
Set this to have arguments from the default kernel command string
override those passed by the boot loader.
endmenu
source "kernel/Kconfig.preempt"
source kernel/Kconfig.hz
endmenu
menu "Power management options"
source kernel/power/Kconfig
endmenu
menu "Executable file formats"
source "fs/Kconfig.binfmt"
endmenu
source "net/Kconfig"
source "drivers/Kconfig"
source "fs/Kconfig"
source "arch/metag/Kconfig.debug"
source "security/Kconfig"
source "crypto/Kconfig"
source "lib/Kconfig"