From 1d78dc59f5ab6f467e49882518453adc7e4caa44 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Sat, 22 Oct 2016 06:19:48 -0700 Subject: [PATCH 01/31] Documentation, ABI: Document the new sysfs files for cpu cache ids Add an ABI document entry for /sys/devices/system/cpu/cpu*/cache/index*/id. Signed-off-by: Tony Luck Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "David Carrillo-Cisneros" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "Shaohua Li" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477142405-32078-2-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- .../ABI/testing/sysfs-devices-system-cpu | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 498741737055..2a4a423d08e0 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -272,6 +272,22 @@ Description: Parameters for the CPU cache attributes the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpu*/cache/index*/id +Date: September 2016 +Contact: Linux kernel mailing list +Description: Cache id + + The id provides a unique number for a specific instance of + a cache of a particular type. E.g. there may be a level + 3 unified cache on each socket in a server and we may + assign them ids 0, 1, 2, ... + + Note that id value can be non-contiguous. E.g. level 1 + caches typically exist per core, but there may not be a + power of two cores on a socket, so these caches may be + numbered 0, 1, 2, 3, 4, 5, 8, 9, 10, ... + What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat From e9a2ea5a1ba09c35258f3663842fb8d8cf2e00c2 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 22 Oct 2016 06:19:49 -0700 Subject: [PATCH 02/31] cacheinfo: Introduce cache id Cache management software needs an id for each instance of a cache of a particular type. The current cacheinfo structure does not provide any information about the underlying hardware so there is no way to expose it. Hardware with cache management features provides means (cpuid, enumeration etc.) to retrieve the hardware id of a particular cache instance. Cache instances which share hardware have the same hardware id. Add an 'id' field to struct cacheinfo to store this information. Expose this information under the /sys/devices/system/cpu/cpu*/cache/index*/ directory as well. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "David Carrillo-Cisneros" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "Shaohua Li" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477142405-32078-3-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- drivers/base/cacheinfo.c | 5 +++++ include/linux/cacheinfo.h | 3 +++ 2 files changed, 8 insertions(+) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index e9fd32e91668..00a9688043f4 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -233,6 +233,7 @@ static ssize_t file_name##_show(struct device *dev, \ return sprintf(buf, "%u\n", this_leaf->object); \ } +show_one(id, id); show_one(level, level); show_one(coherency_line_size, coherency_line_size); show_one(number_of_sets, number_of_sets); @@ -314,6 +315,7 @@ static ssize_t write_policy_show(struct device *dev, return n; } +static DEVICE_ATTR_RO(id); static DEVICE_ATTR_RO(level); static DEVICE_ATTR_RO(type); static DEVICE_ATTR_RO(coherency_line_size); @@ -327,6 +329,7 @@ static DEVICE_ATTR_RO(shared_cpu_list); static DEVICE_ATTR_RO(physical_line_partition); static struct attribute *cache_default_attrs[] = { + &dev_attr_id.attr, &dev_attr_type.attr, &dev_attr_level.attr, &dev_attr_shared_cpu_map.attr, @@ -350,6 +353,8 @@ cache_default_attrs_is_visible(struct kobject *kobj, const struct cpumask *mask = &this_leaf->shared_cpu_map; umode_t mode = attr->mode; + if ((attr == &dev_attr_id.attr) && (this_leaf->attributes & CACHE_ID)) + return mode; if ((attr == &dev_attr_type.attr) && this_leaf->type) return mode; if ((attr == &dev_attr_level.attr) && this_leaf->level) diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 2189935075b4..0bcbb674da9d 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -18,6 +18,7 @@ enum cache_type { /** * struct cacheinfo - represent a cache leaf node + * @id: This cache's id. It is unique among caches with the same (type, level). * @type: type of the cache - data, inst or unified * @level: represents the hierarchy in the multi-level cache * @coherency_line_size: size of each cache line usually representing @@ -44,6 +45,7 @@ enum cache_type { * keeping, the remaining members form the core properties of the cache */ struct cacheinfo { + unsigned int id; enum cache_type type; unsigned int level; unsigned int coherency_line_size; @@ -61,6 +63,7 @@ struct cacheinfo { #define CACHE_WRITE_ALLOCATE BIT(3) #define CACHE_ALLOCATE_POLICY_MASK \ (CACHE_READ_ALLOCATE | CACHE_WRITE_ALLOCATE) +#define CACHE_ID BIT(4) struct device_node *of_node; bool disable_sysfs; From d57e3ab7e34c51a8badeea1b500bfb738d0af66e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 22 Oct 2016 06:19:50 -0700 Subject: [PATCH 03/31] x86/intel_cacheinfo: Enable cache id in cache info Cache id is retrieved from APIC ID and CPUID leaf 4 on x86. For more details please see the section on "Cache ID Extraction Parameters" in "Intel 64 Architecture Processor Topology Enumeration". Also the documentation of the CPUID instruction in the "Intel 64 and IA-32 Architectures Software Developer's Manual" Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "David Carrillo-Cisneros" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "Shaohua Li" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477142405-32078-4-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_cacheinfo.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index de6626c18e42..8dc572085fb4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -153,6 +153,7 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; + unsigned int id; unsigned long size; struct amd_northbridge *nb; }; @@ -894,6 +895,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, static void ci_leaf_init(struct cacheinfo *this_leaf, struct _cpuid4_info_regs *base) { + this_leaf->id = base->id; + this_leaf->attributes = CACHE_ID; this_leaf->level = base->eax.split.level; this_leaf->type = cache_type_map[base->eax.split.type]; this_leaf->coherency_line_size = @@ -920,6 +923,22 @@ static int __init_cache_level(unsigned int cpu) return 0; } +/* + * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + id4_regs->id = c->apicid >> index_msb; +} + static int __populate_cache_leaves(unsigned int cpu) { unsigned int idx, ret; @@ -931,6 +950,7 @@ static int __populate_cache_leaves(unsigned int cpu) ret = cpuid4_cache_lookup_regs(idx, &id4_regs); if (ret) return ret; + get_cache_id(cpu, &id4_regs); ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } From 4ab1586488cb56ed8728e54c4157cc38646874d9 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 22 Oct 2016 06:19:51 -0700 Subject: [PATCH 04/31] x86/cpufeature: Add RDT CPUID feature bits Check CPUID leaves for all the Resource Director Technology (RDT) Cache Allocation Technology (CAT) bits. Presence of allocation features: CPUID.(EAX=7H, ECX=0):EBX[bit 15] X86_FEATURE_RDT_A L2 and L3 caches are each separately enabled: CPUID.(EAX=10H, ECX=0):EBX[bit 1] X86_FEATURE_CAT_L3 CPUID.(EAX=10H, ECX=0):EBX[bit 2] X86_FEATURE_CAT_L2 L3 cache may support independent control of allocation for code and data (CDP = Code/Data Prioritization): CPUID.(EAX=10H, ECX=1):ECX[bit 2] X86_FEATURE_CDP_L3 [ tglx: Fixed up Borislavs comments and moved the feature bits into a gap ] Signed-off-by: Fenghua Yu Acked-by: "Borislav Petkov" Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "David Carrillo-Cisneros" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "Shaohua Li" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477142405-32078-5-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 4 ++++ arch/x86/kernel/cpu/scattered.c | 3 +++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a39629206864..90b8c0b185c3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -189,6 +189,9 @@ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ @@ -221,6 +224,7 @@ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 1db8dc490b66..49fb680bb0e5 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -36,6 +36,9 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CAT_L3, CR_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CR_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CR_ECX, 2, 0x00000010, 1 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, From 78e99b4a2b9afb1c304259fcd4a1c71ca97e3acd Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 22 Oct 2016 06:19:53 -0700 Subject: [PATCH 05/31] x86/intel_rdt: Add CONFIG, Makefile, and basic initialization Introduce CONFIG_INTEL_RDT_A (default: no, dependent on CPU_SUP_INTEL) to control inclusion of Resource Director Technology in the build. Simple init() routine just checks which features are present. If they are pr_info() one line summary for each feature for now. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "David Carrillo-Cisneros" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "Shaohua Li" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477142405-32078-7-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 12 ++++++++ arch/x86/kernel/cpu/Makefile | 2 ++ arch/x86/kernel/cpu/intel_rdt.c | 54 +++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 arch/x86/kernel/cpu/intel_rdt.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bada636d1065..770fb5f23cea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -407,6 +407,18 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH +config INTEL_RDT_A + bool "Intel Resource Director Technology Allocation support" + default n + depends on X86 && CPU_SUP_INTEL + help + Select to enable resource allocation which is a sub-feature of + Intel Resource Director Technology(RDT). More information about + RDT can be found in the Intel x86 Architecture Software + Developer Manual. + + Say N if unsure. + if X86_32 config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4a8697f7d4ef..cf4bfd030c0c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -34,6 +34,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c new file mode 100644 index 000000000000..7d7aebeaac76 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -0,0 +1,54 @@ +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu + * Tony Luck + * Vikas Shivappa + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +static inline bool get_rdt_resources(void) +{ + bool ret = false; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + if (boot_cpu_has(X86_FEATURE_CAT_L3)) + ret = true; + + return ret; +} + +static int __init intel_rdt_late_init(void) +{ + if (!get_rdt_resources()) + return -ENODEV; + + pr_info("Intel RDT cache allocation detected\n"); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) + pr_info("Intel RDT code data prioritization detected\n"); + + return 0; +} + +late_initcall(intel_rdt_late_init); From 113c60970cf41723891e3a1b303517eaf8510bb5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 22 Oct 2016 06:19:54 -0700 Subject: [PATCH 06/31] x86/intel_rdt: Add Haswell feature discovery Some Haswell generation CPUs support RDT, but they don't enumerate this via CPUID. Use rdmsr_safe() and wrmsr_safe() to probe the MSRs on cpu model 63 (INTEL_FAM6_HASWELL_X) Move the relevant defines into a common header file which is shared between RDT/CQM and RDT/Allocation to avoid duplication. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "David Carrillo-Cisneros" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "Shaohua Li" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477142405-32078-8-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/cqm.c | 2 +- arch/x86/include/asm/intel_rdt.h | 6 +++ arch/x86/include/asm/intel_rdt_common.h | 6 +++ arch/x86/kernel/cpu/intel_rdt.c | 49 +++++++++++++++++++++++-- 4 files changed, 58 insertions(+), 5 deletions(-) create mode 100644 arch/x86/include/asm/intel_rdt.h create mode 100644 arch/x86/include/asm/intel_rdt_common.h diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 8f82b02934fa..df868745ad2b 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -7,9 +7,9 @@ #include #include #include +#include #include "../perf_event.h" -#define MSR_IA32_PQR_ASSOC 0x0c8f #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h new file mode 100644 index 000000000000..3aca86ddfa43 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#define IA32_L3_CBM_BASE 0xc90 + +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h new file mode 100644 index 000000000000..e6e15cfd083f --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_common.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_INTEL_RDT_COMMON_H +#define _ASM_X86_INTEL_RDT_COMMON_H + +#define MSR_IA32_PQR_ASSOC 0x0c8f + +#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 7d7aebeaac76..f8e35cf28792 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -27,16 +27,57 @@ #include #include +#include +#include +#include + +/* + * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs + * as they do not have CPUID enumeration support for Cache allocation. + * The check for Vendor/Family/Model is not enough to guarantee that + * the MSRs won't #GP fault because only the following SKUs support + * CAT: + * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz + * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz + * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz + * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz + * + * Probe by trying to write the first of the L3 cach mask registers + * and checking that the bits stick. Max CLOSids is always 4 and max cbm length + * is always 20 on hsw server parts. The minimum cache bitmask length + * allowed for HSW server is always 2 bits. Hardcode all of them. + */ +static inline bool cache_alloc_hsw_probe(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { + u32 l, h, max_cbm = BIT_MASK(20) - 1; + + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return false; + rdmsr(IA32_L3_CBM_BASE, l, h); + + /* If all the bits were set in MSR, return success */ + return l == max_cbm; + } + + return false; +} + static inline bool get_rdt_resources(void) { - bool ret = false; + if (cache_alloc_hsw_probe()) + return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) - ret = true; + if (!boot_cpu_has(X86_FEATURE_CAT_L3)) + return false; - return ret; + return true; } static int __init intel_rdt_late_init(void) From c1c7c3f9d6bb6999a45f66ea4c6bfbcab87ff34b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 22 Oct 2016 06:19:55 -0700 Subject: [PATCH 07/31] x86/intel_rdt: Pick up L3/L2 RDT parameters from CPUID Define struct rdt_resource to hold all the parameterized values for an RDT resource and fill in the CPUID enumerated values from leaf 0x10 if available. Hard code them for the MSR detected Haswells. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "David Carrillo-Cisneros" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "Shaohua Li" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477142405-32078-9-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 68 +++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt.c | 111 +++++++++++++++++++++++++++++-- 2 files changed, 172 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 3aca86ddfa43..9780409d2502 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -2,5 +2,73 @@ #define _ASM_X86_INTEL_RDT_H #define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +/** + * struct rdt_resource - attributes of an RDT resource + * @enabled: Is this feature enabled on this machine + * @capable: Is this feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @max_cbm: Largest Cache Bit Mask allowed + * @min_cbm_bits: Minimum number of consecutive bits to be set + * in a cache bit mask + * @domains: All domains for this resource + * @num_domains: Number of domains active + * @msr_base: Base MSR address for CBMs + * @tmp_cbms: Scratch space when updating schemata + * @cache_level: Which cache level defines scope of this domain + * @cbm_idx_multi: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + */ +struct rdt_resource { + bool enabled; + bool capable; + char *name; + int num_closid; + int cbm_len; + int min_cbm_bits; + u32 max_cbm; + struct list_head domains; + int num_domains; + int msr_base; + u32 *tmp_cbms; + int cache_level; + int cbm_idx_multi; + int cbm_idx_offset; +}; + +extern struct rdt_resource rdt_resources_all[]; + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->capable) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=1).EDX */ +union cpuid_0x10_1_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index f8e35cf28792..157dc8d0df08 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -31,6 +31,47 @@ #include #include +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) + +struct rdt_resource rdt_resources_all[] = { + { + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 1, + .cbm_idx_offset = 0 + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 2, + .cbm_idx_offset = 0 + }, + { + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 2, + .cbm_idx_offset = 1 + }, + { + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 2, + .cbm_idx_multi = 1, + .cbm_idx_offset = 0 + }, +}; + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. @@ -54,6 +95,7 @@ static inline bool cache_alloc_hsw_probe(void) if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; u32 l, h, max_cbm = BIT_MASK(20) - 1; if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) @@ -61,33 +103,88 @@ static inline bool cache_alloc_hsw_probe(void) rdmsr(IA32_L3_CBM_BASE, l, h); /* If all the bits were set in MSR, return success */ - return l == max_cbm; + if (l != max_cbm) + return false; + + r->num_closid = 4; + r->cbm_len = 20; + r->max_cbm = max_cbm; + r->min_cbm_bits = 2; + r->capable = true; + r->enabled = true; + + return true; } return false; } +static void rdt_get_config(int idx, struct rdt_resource *r) +{ + union cpuid_0x10_1_eax eax; + union cpuid_0x10_1_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->cbm_len = eax.split.cbm_len + 1; + r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->capable = true; + r->enabled = true; +} + +static void rdt_get_cdp_l3_config(int type) +{ + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r = &rdt_resources_all[type]; + + r->num_closid = r_l3->num_closid / 2; + r->cbm_len = r_l3->cbm_len; + r->max_cbm = r_l3->max_cbm; + r->capable = true; + /* + * By default, CDP is disabled. CDP can be enabled by mount parameter + * "cdp" during resctrl file system mount time. + */ + r->enabled = false; +} + static inline bool get_rdt_resources(void) { + bool ret = false; + if (cache_alloc_hsw_probe()) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (!boot_cpu_has(X86_FEATURE_CAT_L3)) - return false; - return true; + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); + } + ret = true; + } + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are same format at 0x10.1 */ + rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + ret = true; + } + + return ret; } static int __init intel_rdt_late_init(void) { + struct rdt_resource *r; + if (!get_rdt_resources()) return -ENODEV; - pr_info("Intel RDT cache allocation detected\n"); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) - pr_info("Intel RDT code data prioritization detected\n"); + for_each_capable_rdt_resource(r) + pr_info("Intel RDT %s allocation detected\n", r->name); return 0; } From 6b281569df649ed76145c527028fbbe8a32493aa Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Sat, 22 Oct 2016 06:19:56 -0700 Subject: [PATCH 08/31] x86/cqm: Share PQR_ASSOC related data between CQM and CAT PQR_ASSOC MSR contains the RMID used for preformance monitoring of cache occupancy and memory bandwidth. The upper 32bit of this MSR contain the CLOSID for cache allocation. So we need to share the information between the two facilities. Move the rdt data structure declaration into the shared header file and make the per cpu data structure containing the MSR values global. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "David Carrillo-Cisneros" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "Shaohua Li" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477142405-32078-10-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/cqm.c | 21 +-------------------- arch/x86/include/asm/intel_rdt_common.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index df868745ad2b..0c45cc8e64ba 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -24,32 +24,13 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */ static bool cqm_enabled, mbm_enabled; unsigned int mbm_socket_max; -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - /* * The cached intel_pqr_state is strictly per CPU and can never be * updated from a remote CPU. Both functions which modify the state * (intel_cqm_event_start and intel_cqm_event_stop) are called with * interrupts disabled, which is sufficient for the protection. */ -static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); static struct hrtimer *mbm_timers; /** * struct sample - mbm event's (local or total) data diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h index e6e15cfd083f..b31081b89407 100644 --- a/arch/x86/include/asm/intel_rdt_common.h +++ b/arch/x86/include/asm/intel_rdt_common.h @@ -3,4 +3,25 @@ #define MSR_IA32_PQR_ASSOC 0x0c8f +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; + int rmid_usecnt; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + #endif /* _ASM_X86_INTEL_RDT_COMMON_H */ From f20e57892806ad244eaec7a7ae365e78fee53377 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 28 Oct 2016 15:04:40 -0700 Subject: [PATCH 09/31] Documentation, x86: Documentation for Intel resource allocation user interface The documentation describes user interface of how to allocate resource in Intel RDT. Please note that the documentation covers generic user interface. Current patch set code only implemente CAT L3. CAT L2 code will be sent later. [ tglx: Added cpu example ] Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-2-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- Documentation/x86/intel_rdt_ui.txt | 195 +++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 Documentation/x86/intel_rdt_ui.txt diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt new file mode 100644 index 000000000000..3b0ebd4cf423 --- /dev/null +++ b/Documentation/x86/intel_rdt_ui.txt @@ -0,0 +1,195 @@ +User Interface for Resource Allocation in Intel Resource Director Technology + +Copyright (C) 2016 Intel Corporation + +Fenghua Yu +Tony Luck + +This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the +X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3". + +To use the feature mount the file system: + + # mount -t resctrl resctrl [-o cdp] /sys/fs/resctrl + +mount options are: + +"cdp": Enable code/data prioritization in L3 cache allocations. + + +Resource groups +--------------- +Resource groups are represented as directories in the resctrl file +system. The default group is the root directory. Other groups may be +created as desired by the system administrator using the "mkdir(1)" +command, and removed using "rmdir(1)". + +There are three files associated with each group: + +"tasks": A list of tasks that belongs to this group. Tasks can be + added to a group by writing the task ID to the "tasks" file + (which will automatically remove them from the previous + group to which they belonged). New tasks created by fork(2) + and clone(2) are added to the same group as their parent. + If a pid is not in any sub partition, it is in root partition + (i.e. default partition). + +"cpus": A bitmask of logical CPUs assigned to this group. Writing + a new mask can add/remove CPUs from this group. Added CPUs + are removed from their previous group. Removed ones are + given to the default (root) group. You cannot remove CPUs + from the default group. + +"schemata": A list of all the resources available to this group. + Each resource has its own line and format - see below for + details. + +When a task is running the following rules define which resources +are available to it: + +1) If the task is a member of a non-default group, then the schemata +for that group is used. + +2) Else if the task belongs to the default group, but is running on a +CPU that is assigned to some specific group, then the schemata for +the CPU's group is used. + +3) Otherwise the schemata for the default group is used. + + +Schemata files - general concepts +--------------------------------- +Each line in the file describes one resource. The line starts with +the name of the resource, followed by specific values to be applied +in each of the instances of that resource on the system. + +Cache IDs +--------- +On current generation systems there is one L3 cache per socket and L2 +caches are generally just shared by the hyperthreads on a core, but this +isn't an architectural requirement. We could have multiple separate L3 +caches on a socket, multiple cores could share an L2 cache. So instead +of using "socket" or "core" to define the set of logical cpus sharing +a resource we use a "Cache ID". At a given cache level this will be a +unique number across the whole system (but it isn't guaranteed to be a +contiguous sequence, there may be gaps). To find the ID for each logical +CPU look in /sys/devices/system/cpu/cpu*/cache/index*/id + +Cache Bit Masks (CBM) +--------------------- +For cache resources we describe the portion of the cache that is available +for allocation using a bitmask. The maximum value of the mask is defined +by each cpu model (and may be different for different cache levels). It +is found using CPUID, but is also provided in the "info" directory of +the resctrl file system in "info/{resource}/cbm_mask". X86 hardware +requires that these masks have all the '1' bits in a contiguous block. So +0x3, 0x6 and 0xC are legal 4-bit masks with two bits set, but 0x5, 0x9 +and 0xA are not. On a system with a 20-bit mask each bit represents 5% +of the capacity of the cache. You could partition the cache into four +equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000. + + +L3 details (code and data prioritization disabled) +-------------------------------------------------- +With CDP disabled the L3 schemata format is: + + L3:=;=;... + +L3 details (CDP enabled via mount option to resctrl) +---------------------------------------------------- +When CDP is enabled L3 control is split into two separate resources +so you can specify independent masks for code and data like this: + + L3data:=;=;... + L3code:=;=;... + +L2 details +---------- +L2 cache does not support code and data prioritization, so the +schemata format is always: + + L2:=;=;... + +Example 1 +--------- +On a two socket machine (one L3 cache per socket) with just four bits +for cache bit masks + +# mount -t resctrl resctrl /sys/fs/resctrl +# cd /sys/fs/resctrl +# mkdir p0 p1 +# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata +# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata + +The default resource group is unmodified, so we have access to all parts +of all caches (its schemata file reads "L3:0=f;1=f"). + +Tasks that are under the control of group "p0" may only allocate from the +"lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1. +Tasks in group "p1" use the "lower" 50% of cache on both sockets. + +Example 2 +--------- +Again two sockets, but this time with a more realistic 20-bit mask. + +Two real time tasks pid=1234 running on processor 0 and pid=5678 running on +processor 1 on socket 0 on a 2-socket and dual core machine. To avoid noisy +neighbors, each of the two real-time tasks exclusively occupies one quarter +of L3 cache on socket 0. + +# mount -t resctrl resctrl /sys/fs/resctrl +# cd /sys/fs/resctrl + +First we reset the schemata for the default group so that the "upper" +50% of the L3 cache on socket 0 cannot be used by ordinary tasks: + +# echo "L3:0=3ff;1=fffff" > schemata + +Next we make a resource group for our first real time task and give +it access to the "top" 25% of the cache on socket 0. + +# mkdir p0 +# echo "L3:0=f8000;1=fffff" > p0/schemata + +Finally we move our first real time task into this resource group. We +also use taskset(1) to ensure the task always runs on a dedicated CPU +on socket 0. Most uses of resource groups will also constrain which +processors tasks run on. + +# echo 1234 > p0/tasks +# taskset -cp 1 1234 + +Ditto for the second real time task (with the remaining 25% of cache): + +# mkdir p1 +# echo "L3:0=7c00;1=fffff" > p1/schemata +# echo 5678 > p1/tasks +# taskset -cp 2 5678 + +Example 3 +--------- + +A single socket system which has real-time tasks running on core 4-7 and +non real-time workload assigned to core 0-3. The real-time tasks share text +and data, so a per task association is not required and due to interaction +with the kernel it's desired that the kernel on these cores shares L3 with +the tasks. + +# mount -t resctrl resctrl /sys/fs/resctrl +# cd /sys/fs/resctrl + +First we reset the schemata for the default group so that the "upper" +50% of the L3 cache on socket 0 cannot be used by ordinary tasks: + +# echo "L3:0=3ff" > schemata + +Next we make a resource group for our real time cores and give +it access to the "top" 50% of the cache on socket 0. + +# mkdir p0 +# echo "L3:0=ffc00;" > p0/schemata + +Finally we move core 4-7 over to the new group and make sure that the +kernel and the tasks running there get 50% of the cache. + +# echo C0 > p0/cpus From 2264d9c74dda1b6835ab7858204073547457dfd0 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Oct 2016 15:04:41 -0700 Subject: [PATCH 10/31] x86/intel_rdt: Build structures for each resource based on cache topology We use the cpu hotplug notifier to catch each cpu in turn and look at its cache topology w.r.t each of the resource groups. As we discover new resources, we initialize the bitmask array for each to the default (full access) value. Signed-off-by: Tony Luck Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-3-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 35 ++++++ arch/x86/kernel/cpu/intel_rdt.c | 191 ++++++++++++++++++++++++++++++- 2 files changed, 225 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 9780409d2502..c0d0a6e6448c 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -39,6 +39,34 @@ struct rdt_resource { int cbm_idx_offset; }; +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @cbm: array of cache bit masks (indexed by CLOSID) + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + u32 *cbm; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +extern struct mutex rdtgroup_mutex; + extern struct rdt_resource rdt_resources_all[]; enum { @@ -56,6 +84,11 @@ enum { r++) \ if (r->capable) +#define for_each_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->enabled) + /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { struct { @@ -71,4 +104,6 @@ union cpuid_0x10_1_edx { } split; unsigned int full; }; + +void rdt_cbm_update(void *arg); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 157dc8d0df08..3d4b397a1181 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -26,11 +26,16 @@ #include #include +#include +#include #include #include #include +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { @@ -72,6 +77,11 @@ struct rdt_resource rdt_resources_all[] = { }, }; +static int cbm_idx(struct rdt_resource *r, int closid) +{ + return closid * r->cbm_idx_multi + r->cbm_idx_offset; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. @@ -176,13 +186,192 @@ static inline bool get_rdt_resources(void) return ret; } -static int __init intel_rdt_late_init(void) +static int get_cache_id(int cpu, int level) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) + return ci->info_list[i].id; + } + + return -1; +} + +void rdt_cbm_update(void *arg) +{ + struct msr_param *m = (struct msr_param *)arg; + struct rdt_resource *r = m->res; + int i, cpu = smp_processor_id(); + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + goto found; + } + pr_info_once("cpu %d not found in any domain for resource %s\n", + cpu, r->name); + + return; + +found: + for (i = m->low; i < m->high; i++) { + int idx = cbm_idx(r, i); + + wrmsrl(r->msr_base + idx, d->cbm[i]); + } +} + +/* + * rdt_find_domain - Find a domain in a resource that matches input resource id + * + * Search resource r's domain list to find the resource id. If the resource + * id is found in a domain, return the domain. Otherwise, if requested by + * caller, return the first domain whose id is bigger than the input id. + * The domain list is sorted by id in ascending order. + */ +static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) +{ + struct rdt_domain *d; + struct list_head *l; + + if (id < 0) + return ERR_PTR(id); + + list_for_each(l, &r->domains) { + d = list_entry(l, struct rdt_domain, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +/* + * domain_add_cpu - Add a cpu to a resource's domain list. + * + * If an existing domain in the resource r's domain list matches the cpu's + * resource id, add the cpu in the domain. + * + * Otherwise, a new domain is allocated and inserted into the right position + * in the domain list sorted by id in ascending order. + * + * The order in the domain list is visible to users when we print entries + * in the schemata file and schemata input is validated to have the same order + * as this list. + */ +static void domain_add_cpu(int cpu, struct rdt_resource *r) +{ + int i, id = get_cache_id(cpu, r->cache_level); + struct list_head *add_pos = NULL; + struct rdt_domain *d; + + d = rdt_find_domain(r, id, &add_pos); + if (IS_ERR(d)) { + pr_warn("Could't find cache id for cpu %d\n", cpu); + return; + } + + if (d) { + cpumask_set_cpu(cpu, &d->cpu_mask); + return; + } + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->id = id; + + d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); + if (!d->cbm) { + kfree(d); + return; + } + + for (i = 0; i < r->num_closid; i++) { + int idx = cbm_idx(r, i); + + d->cbm[i] = r->max_cbm; + wrmsrl(r->msr_base + idx, d->cbm[i]); + } + + cpumask_set_cpu(cpu, &d->cpu_mask); + list_add_tail(&d->list, add_pos); + r->num_domains++; +} + +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + int id = get_cache_id(cpu, r->cache_level); + struct rdt_domain *d; + + d = rdt_find_domain(r, id, NULL); + if (IS_ERR_OR_NULL(d)) { + pr_warn("Could't find cache id for cpu %d\n", cpu); + return; + } + + cpumask_clear_cpu(cpu, &d->cpu_mask); + if (cpumask_empty(&d->cpu_mask)) { + r->num_domains--; + kfree(d->cbm); + list_del(&d->list); + kfree(d); + } +} + +static int intel_rdt_online_cpu(unsigned int cpu) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_add_cpu(cpu, r); + state->closid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdt_resource *r; + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_remove_cpu(cpu, r); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int __init intel_rdt_late_init(void) +{ + struct rdt_resource *r; + int state; + if (!get_rdt_resources()) return -ENODEV; + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/rdt/cat:online:", + intel_rdt_online_cpu, intel_rdt_offline_cpu); + if (state < 0) + return state; + for_each_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); From 5ff193fbde20df5d80fec367cea3e7856c057320 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 28 Oct 2016 15:04:42 -0700 Subject: [PATCH 11/31] x86/intel_rdt: Add basic resctrl filesystem support Use kernfs as basis for our user interface filesystem. This patch supports mount/umount, and one mount parameter "cdp" to enable code/data prioritization (though all we do at this point is ensure that the system can support CDP). The file system is not populated yet in this patch. [ tglx: Fixed up a few nits and added cdp handling in case of error ] Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-4-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 26 +++ arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/intel_rdt.c | 8 +- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 271 +++++++++++++++++++++++ include/uapi/linux/magic.h | 1 + 5 files changed, 306 insertions(+), 2 deletions(-) create mode 100644 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index c0d0a6e6448c..09d00e6f7ad6 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -1,9 +1,31 @@ #ifndef _ASM_X86_INTEL_RDT_H #define _ASM_X86_INTEL_RDT_H +#include + +#define IA32_L3_QOS_CFG 0xc81 #define IA32_L3_CBM_BASE 0xc90 #define IA32_L2_CBM_BASE 0xd10 +#define L3_QOS_CDP_ENABLE 0x01ULL + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + int closid; +}; + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +int __init rdtgroup_init(void); + /** * struct rdt_resource - attributes of an RDT resource * @enabled: Is this feature enabled on this machine @@ -68,6 +90,10 @@ struct msr_param { extern struct mutex rdtgroup_mutex; extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +int __init rdtgroup_init(void); enum { RDT_RESOURCE_L3, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cf4bfd030c0c..b4334e86c1a9 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -34,7 +34,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o +obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 3d4b397a1181..9d95414f7117 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -361,7 +361,7 @@ static int intel_rdt_offline_cpu(unsigned int cpu) static int __init intel_rdt_late_init(void) { struct rdt_resource *r; - int state; + int state, ret; if (!get_rdt_resources()) return -ENODEV; @@ -372,6 +372,12 @@ static int __init intel_rdt_late_init(void) if (state < 0) return state; + ret = rdtgroup_init(); + if (ret) { + cpuhp_remove_state(state); + return ret; + } + for_each_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c new file mode 100644 index 000000000000..106e4ce8f7e0 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -0,0 +1,271 @@ +/* + * User interface for Resource Alloction in Resource Director Technology(RDT) + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Fenghua Yu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +#include + +#include + +DEFINE_STATIC_KEY_FALSE(rdt_enable_key); +struct kernfs_root *rdt_root; +struct rdtgroup rdtgroup_default; +LIST_HEAD(rdt_all_groups); + +static void l3_qos_cfg_update(void *arg) +{ + bool *enable = arg; + + wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); +} + +static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) +{ + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + list_for_each_entry(d, &r->domains, list) { + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + } + cpu = get_cpu(); + /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + l3_qos_cfg_update(&enable); + /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +static int cdp_enable(void) +{ + struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + int ret; + + if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + return -EINVAL; + + ret = set_l3_qos_cfg(r_l3, true); + if (!ret) { + r_l3->enabled = false; + r_l3data->enabled = true; + r_l3code->enabled = true; + } + return ret; +} + +static void cdp_disable(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + + r->enabled = r->capable; + + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + set_l3_qos_cfg(r, false); + } +} + +static int parse_rdtgroupfs_options(char *data) +{ + char *token, *o = data; + int ret = 0; + + while ((token = strsep(&o, ",")) != NULL) { + if (!*token) + return -EINVAL; + + if (!strcmp(token, "cdp")) + ret = cdp_enable(); + } + + return ret; +} + +static struct dentry *rdt_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct dentry *dentry; + int ret; + + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. + */ + if (static_branch_unlikely(&rdt_enable_key)) { + dentry = ERR_PTR(-EBUSY); + goto out; + } + + ret = parse_rdtgroupfs_options(data); + if (ret) { + dentry = ERR_PTR(ret); + goto out_cdp; + } + + dentry = kernfs_mount(fs_type, flags, rdt_root, + RDTGROUP_SUPER_MAGIC, NULL); + if (IS_ERR(dentry)) + goto out_cdp; + + static_branch_enable(&rdt_enable_key); + goto out; + +out_cdp: + cdp_disable(); +out: + mutex_unlock(&rdtgroup_mutex); + + return dentry; +} + +static int reset_all_cbms(struct rdt_resource *r) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int i, cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.res = r; + msr_param.low = 0; + msr_param.high = r->num_closid; + + /* + * Disable resource control for this resource by setting all + * CBMs in all domains to the maximum mask value. Pick one CPU + * from each domain to update the MSRs below. + */ + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + + for (i = 0; i < r->num_closid; i++) + d->cbm[i] = r->max_cbm; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_cbm_update(&msr_param); + /* Update CBM on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + + /*Put everything back to default values. */ + for_each_enabled_rdt_resource(r) + reset_all_cbms(r); + cdp_disable(); + static_branch_disable(&rdt_enable_key); + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .mount = rdt_mount, + .kill_sb = rdt_kill_sb, +}; + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { +}; + +static int __init rdtgroup_setup_root(void) +{ + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = 0; + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + rdtgroup_default.kn = rdt_root->kn; + kernfs_activate(rdtgroup_default.kn); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +/* + * rdtgroup_init - rdtgroup initialization + * + * Setup resctrl file system including set up root, create mount point, + * register rdtgroup filesystem, and initialize files under root directory. + * + * Return: 0 on success or -errno + */ +int __init rdtgroup_init(void) +{ + int ret = 0; + + ret = rdtgroup_setup_root(); + if (ret) + return ret; + + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) + goto cleanup_root; + + ret = register_filesystem(&rdt_fs_type); + if (ret) + goto cleanup_mountpoint; + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "resctrl"); +cleanup_root: + kernfs_destroy_root(rdt_root); + + return ret; +} diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 9bd559472c92..e230af2e6855 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -57,6 +57,7 @@ #define CGROUP_SUPER_MAGIC 0x27e0eb #define CGROUP2_SUPER_MAGIC 0x63677270 +#define RDTGROUP_SUPER_MAGIC 0x7655821 #define STACK_END_MAGIC 0x57AC6E9D From 4e978d06dedb8207b298a5a8a49fce4b2ab80d12 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 28 Oct 2016 15:04:43 -0700 Subject: [PATCH 12/31] x86/intel_rdt: Add "info" files to resctrl file system For the convenience of applications we make the decoded values of some of the CPUID values available in read-only (0444) files. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-5-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 24 +++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 185 +++++++++++++++++++++++ 2 files changed, 209 insertions(+) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 09d00e6f7ad6..5b7b3f6f1f96 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -26,6 +26,30 @@ extern struct list_head rdt_all_groups; int __init rdtgroup_init(void); +/** + * struct rftype - describe each file in the resctrl file system + * @name: file name + * @mode: access mode + * @kf_ops: operations + * @seq_show: show content of the file + * @write: write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. + */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + /** * struct rdt_resource - attributes of an RDT resource * @enabled: Is this feature enabled on this machine diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 106e4ce8f7e0..fbb42e79b10a 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include #include @@ -34,6 +36,176 @@ struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); +/* Kernel fs node for "info" directory under root */ +static struct kernfs_node *kn_info; + +/* set uid and gid of rdtgroup dirs and files to that of the creator */ +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + 0, rft->kf_ops, rft, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return 0; +} + +static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, + int len) +{ + struct rftype *rft; + int ret; + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) + kernfs_remove_by_name(kn, rft->name); + return ret; +} + +static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + struct rftype *rft = of->kn->priv; + + if (rft->seq_show) + return rft->seq_show(of, m, arg); + return 0; +} + +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rftype *rft = of->kn->priv; + + if (rft->write) + return rft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static struct kernfs_ops rdtgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = rdtgroup_file_write, + .seq_show = rdtgroup_seqfile_show, +}; + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_closid); + + return 0; +} + +static int rdt_cbm_mask_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->max_cbm); + + return 0; +} + +/* rdtgroup information files for one cache resource. */ +static struct rftype res_info_files[] = { + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_cbm_mask_show, + }, +}; + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct kernfs_node *kn_subdir; + struct rdt_resource *r; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + kernfs_get(kn_info); + + for_each_enabled_rdt_resource(r) { + kn_subdir = kernfs_create_dir(kn_info, r->name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) { + ret = PTR_ERR(kn_subdir); + goto out_destroy; + } + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + goto out_destroy; + ret = rdtgroup_add_files(kn_subdir, res_info_files, + ARRAY_SIZE(res_info_files)); + if (ret) + goto out_destroy; + kernfs_activate(kn_subdir); + } + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn_info); + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -137,6 +309,10 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret) + goto out_cdp; + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) @@ -191,6 +367,14 @@ static int reset_all_cbms(struct rdt_resource *r) return 0; } +/* + * Forcibly remove all of subdirectories under root. + */ +static void rmdir_all_sub(void) +{ + kernfs_remove(kn_info); +} + static void rdt_kill_sb(struct super_block *sb) { struct rdt_resource *r; @@ -201,6 +385,7 @@ static void rdt_kill_sb(struct super_block *sb) for_each_enabled_rdt_resource(r) reset_all_cbms(r); cdp_disable(); + rmdir_all_sub(); static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); From 60cf5e101fd4441ab112a81e88726efb6fd7542c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 28 Oct 2016 15:04:44 -0700 Subject: [PATCH 13/31] x86/intel_rdt: Add mkdir to resctrl file system Resource control groups are represented as directories in the resctrl file system. The root directory describes the default resources available to tasks that have not been assigned specific resources. Other directories can be created at the root level to make new resource groups. It is not permitted to make directories within other directories. Hardware uses a CLOSID (Class of service ID) to determine which resource limits are currently in effect. The exact number available is enumerated by CPUID leaf 0x10, but on current implementations it is a small number. We implement a simple bitmask allocator for CLOSIDs. Each resource control group uses one CLOSID, which limits the total number of directories that can be created. Resource groups can be removed using rmdir. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-6-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 9 + arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 231 +++++++++++++++++++++++ 2 files changed, 240 insertions(+) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 5b7b3f6f1f96..8032ace7c0fd 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -14,13 +14,20 @@ * @kn: kernfs node * @rdtgroup_list: linked list for all rdtgroups * @closid: closid for this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this */ struct rdtgroup { struct kernfs_node *kn; struct list_head rdtgroup_list; int closid; + int flags; + atomic_t waitcount; }; +/* rdtgroup.flags */ +#define RDT_DELETED 1 + /* List of all resource groups */ extern struct list_head rdt_all_groups; @@ -156,4 +163,6 @@ union cpuid_0x10_1_edx { }; void rdt_cbm_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index fbb42e79b10a..85d31eacabe0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -26,10 +26,12 @@ #include #include #include +#include #include #include +#include DEFINE_STATIC_KEY_FALSE(rdt_enable_key); struct kernfs_root *rdt_root; @@ -39,6 +41,55 @@ LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* + * Trivial allocator for CLOSIDs. Since h/w only supports a small number, + * we can keep a bitmap of free CLOSIDs in a single integer. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set "current->closid" to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. + */ +static int closid_free_map; + +static void closid_init(void) +{ + struct rdt_resource *r; + int rdt_min_closid = 32; + + /* Compute rdt_min_closid across all resources */ + for_each_enabled_rdt_resource(r) + rdt_min_closid = min(rdt_min_closid, r->num_closid); + + closid_free_map = BIT_MASK(rdt_min_closid) - 1; + + /* CLOSID 0 is always reserved for the default group */ + closid_free_map &= ~1; +} + +int closid_alloc(void) +{ + int closid = ffs(closid_free_map); + + if (closid == 0) + return -ENOSPC; + closid--; + closid_free_map &= ~(1 << closid); + + return closid; +} + +static void closid_free(int closid) +{ + closid_free_map |= 1 << closid; +} + /* set uid and gid of rdtgroup dirs and files to that of the creator */ static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) { @@ -287,6 +338,54 @@ static int parse_rdtgroupfs_options(char *data) return ret; } +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) + return kn->priv; + else + return kn->parent->priv; +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); + + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? */ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + mutex_unlock(&rdtgroup_mutex); + + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + kernfs_unbreak_active_protection(kn); + kernfs_put(kn); + kfree(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) @@ -309,6 +408,8 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, goto out_cdp; } + closid_init(); + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); if (ret) goto out_cdp; @@ -367,11 +468,40 @@ static int reset_all_cbms(struct rdt_resource *r) return 0; } +/* + * MSR_IA32_PQR_ASSOC is scoped per logical CPU, so all updates + * are always in thread context. + */ +static void rdt_reset_pqr_assoc_closid(void *v) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + + state->closid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); +} + /* * Forcibly remove all of subdirectories under root. */ static void rmdir_all_sub(void) { + struct rdtgroup *rdtgrp, *tmp; + + get_cpu(); + /* Reset PQR_ASSOC MSR on this cpu. */ + rdt_reset_pqr_assoc_closid(NULL); + /* Reset PQR_ASSOC MSR on the rest of cpus. */ + smp_call_function_many(cpu_online_mask, rdt_reset_pqr_assoc_closid, + NULL, 1); + put_cpu(); + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Remove each rdtgroup other than root */ + if (rdtgrp == &rdtgroup_default) + continue; + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); + kfree(rdtgrp); + } kernfs_remove(kn_info); } @@ -397,7 +527,108 @@ static struct file_system_type rdt_fs_type = { .kill_sb = rdt_kill_sb, }; +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct rdtgroup *parent, *rdtgrp; + struct kernfs_node *kn; + int ret, closid; + + /* Only allow mkdir in the root directory */ + if (parent_kn != rdtgroup_default.kn) + return -EPERM; + + /* Do not accept '\n' to avoid unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = rdtgroup_kn_lock_live(parent_kn); + if (!parent) { + ret = -ENODEV; + goto out_unlock; + } + + ret = closid_alloc(); + if (ret < 0) + goto out_unlock; + closid = ret; + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + goto out_closid_free; + } + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_cancel_ref; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn} call below. Take one extra reference + * here, which will be dropped inside rdtgroup_kn_unlock(). + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + ret = 0; + goto out_unlock; + +out_destroy: + kernfs_remove(rdtgrp->kn); +out_cancel_ref: + list_del(&rdtgrp->rdtgroup_list); + kfree(rdtgrp); +out_closid_free: + closid_free(closid); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(kn); + return -ENOENT; + } + + rdtgrp->flags = RDT_DELETED; + closid_free(rdtgrp->closid); + list_del(&rdtgrp->rdtgroup_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + rdtgroup_kn_unlock(kn); + + return ret; +} + static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, }; static int __init rdtgroup_setup_root(void) From 12e0110c11a460b890ed7e1071198ced732152c9 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Oct 2016 15:04:45 -0700 Subject: [PATCH 14/31] x86/intel_rdt: Add cpus file Now we populate each directory with a read/write (mode 0644) file named "cpus". This is used to over-ride the resources available to processes in the default resource group when running on specific CPUs. Each "cpus" file reads as a cpumask showing which CPUs belong to this resource group. Initially all online CPUs are assigned to the default group. They can be added to other groups by writing a cpumask to the "cpus" file in the directory for the resource group (which will remove them from the previous group to which they were assigned). CPU online/offline operations will delete CPUs that go offline from whatever group they are in and add new CPUs to the default group. If there are CPUs assigned to a group when the directory is removed, they are returned to the default group. Signed-off-by: Tony Luck Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-7-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 5 + arch/x86/kernel/cpu/intel_rdt.c | 23 +++- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 131 ++++++++++++++++++++++- 3 files changed, 154 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 8032ace7c0fd..a0dd3e99038d 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -14,13 +14,16 @@ * @kn: kernfs node * @rdtgroup_list: linked list for all rdtgroups * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup * @flags: status bits * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex */ struct rdtgroup { struct kernfs_node *kn; struct list_head rdtgroup_list; int closid; + struct cpumask cpu_mask; int flags; atomic_t waitcount; }; @@ -162,6 +165,8 @@ union cpuid_0x10_1_edx { unsigned int full; }; +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); + void rdt_cbm_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 9d95414f7117..40094aed5f71 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -36,6 +36,8 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); + #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { @@ -331,16 +333,25 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) } } -static int intel_rdt_online_cpu(unsigned int cpu) +static void clear_closid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + + per_cpu(cpu_closid, cpu) = 0; + state->closid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); +} + +static int intel_rdt_online_cpu(unsigned int cpu) +{ struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + /* The cpu is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + clear_closid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -348,11 +359,17 @@ static int intel_rdt_online_cpu(unsigned int cpu) static int intel_rdt_offline_cpu(unsigned int cpu) { + struct rdtgroup *rdtgrp; struct rdt_resource *r; mutex_lock(&rdtgroup_mutex); for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + break; + } + clear_closid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 85d31eacabe0..e05a18685fc8 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -20,6 +20,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -172,6 +173,111 @@ static struct kernfs_ops rdtgroup_kf_single_ops = { .seq_show = rdtgroup_seqfile_show, }; +static int rdtgroup_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) + seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask; + struct rdtgroup *rdtgrp, *r; + int ret, cpu; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + ret = cpumask_parse(buf, newmask); + if (ret) + goto unlock; + + get_online_cpus(); + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (cpumask_weight(tmpmask)) { + ret = -EINVAL; + goto end; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + ret = -EINVAL; + goto end; + } + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + for_each_cpu(cpu, tmpmask) + per_cpu(cpu_closid, cpu) = 0; + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu closid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); + } + for_each_cpu(cpu, tmpmask) + per_cpu(cpu_closid, cpu) = rdtgrp->closid; + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + +end: + put_online_cpus(); +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + + return ret ?: nbytes; +} + +/* Files in each rdtgroup */ +static struct rftype rdtgroup_base_files[] = { + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + }, +}; + static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -582,6 +688,11 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; + ret = rdtgroup_add_files(kn, rdtgroup_base_files, + ARRAY_SIZE(rdtgroup_base_files)); + if (ret) + goto out_destroy; + kernfs_activate(kn); ret = 0; @@ -602,7 +713,7 @@ out_unlock: static int rdtgroup_rmdir(struct kernfs_node *kn) { struct rdtgroup *rdtgrp; - int ret = 0; + int cpu, ret = 0; rdtgrp = rdtgroup_kn_lock_live(kn); if (!rdtgrp) { @@ -610,6 +721,12 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) return -ENOENT; } + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(cpu_closid, cpu) = 0; + rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); list_del(&rdtgrp->rdtgroup_list); @@ -633,6 +750,8 @@ static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { static int __init rdtgroup_setup_root(void) { + int ret; + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, KERNFS_ROOT_CREATE_DEACTIVATED, &rdtgroup_default); @@ -644,12 +763,20 @@ static int __init rdtgroup_setup_root(void) rdtgroup_default.closid = 0; list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, + ARRAY_SIZE(rdtgroup_base_files)); + if (ret) { + kernfs_destroy_root(rdt_root); + goto out; + } + rdtgroup_default.kn = rdt_root->kn; kernfs_activate(rdtgroup_default.kn); +out: mutex_unlock(&rdtgroup_mutex); - return 0; + return ret; } /* From e02737d5b82640497637d18428e2793bb7f02881 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 28 Oct 2016 15:04:46 -0700 Subject: [PATCH 15/31] x86/intel_rdt: Add tasks files The root directory all subdirectories are automatically populated with a read/write (mode 0644) file named "tasks". When read it will show all the task IDs assigned to the resource group. Tasks can be added (one at a time) to a group by writing the task ID to the file. E.g. Membership in a resource group is indicated by a new field in the task_struct "int closid" which holds the CLOSID for each task. The default resource group uses CLOSID=0 which means that all existing tasks when the resctrl file system is mounted belong to the default group. If a group is removed, tasks which are members of that group are moved to the default group. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-8-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 180 +++++++++++++++++++++++ include/linux/sched.h | 3 + 2 files changed, 183 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index e05a18685fc8..5cc0865f2908 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -267,6 +268,162 @@ unlock: return ret ?: nbytes; } +struct task_move_callback { + struct callback_head work; + struct rdtgroup *rdtgrp; +}; + +static void move_myself(struct callback_head *head) +{ + struct task_move_callback *callback; + struct rdtgroup *rdtgrp; + + callback = container_of(head, struct task_move_callback, work); + rdtgrp = callback->rdtgrp; + + /* + * If resource group was deleted before this task work callback + * was invoked, then assign the task to root group and free the + * resource group. + */ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + current->closid = 0; + kfree(rdtgrp); + } + + kfree(callback); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + struct task_move_callback *callback; + int ret; + + callback = kzalloc(sizeof(*callback), GFP_KERNEL); + if (!callback) + return -ENOMEM; + callback->work.func = move_myself; + callback->rdtgrp = rdtgrp; + + /* + * Take a refcount, so rdtgrp cannot be freed before the + * callback has been invoked. + */ + atomic_inc(&rdtgrp->waitcount); + ret = task_work_add(tsk, &callback->work, true); + if (ret) { + /* + * Task is exiting. Drop the refcount and free the callback. + * No need to check the refcount as the group cannot be + * deleted before the write function unlocks rdtgroup_mutex. + */ + atomic_dec(&rdtgrp->waitcount); + kfree(callback); + } else { + tsk->closid = rdtgrp->closid; + } + return ret; +} + +static int rdtgroup_task_write_permission(struct task_struct *task, + struct kernfs_open_file *of) +{ + const struct cred *tcred = get_task_cred(task); + const struct cred *cred = current_cred(); + int ret = 0; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EPERM; + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + pid_t pid; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) + ret = rdtgroup_move_task(pid, rdtgrp, of); + else + ret = -ENOENT; + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (t->closid == r->closid) + seq_printf(s, "%d\n", t->pid); + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + /* Files in each rdtgroup */ static struct rftype rdtgroup_base_files[] = { { @@ -276,6 +433,13 @@ static struct rftype rdtgroup_base_files[] = { .write = rdtgroup_cpus_write, .seq_show = rdtgroup_cpus_show, }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + }, }; static int rdt_num_closids_show(struct kernfs_open_file *of, @@ -592,6 +756,13 @@ static void rdt_reset_pqr_assoc_closid(void *v) static void rmdir_all_sub(void) { struct rdtgroup *rdtgrp, *tmp; + struct task_struct *p, *t; + + /* move all tasks to default resource group */ + read_lock(&tasklist_lock); + for_each_process_thread(p, t) + t->closid = 0; + read_unlock(&tasklist_lock); get_cpu(); /* Reset PQR_ASSOC MSR on this cpu. */ @@ -712,6 +883,7 @@ out_unlock: static int rdtgroup_rmdir(struct kernfs_node *kn) { + struct task_struct *p, *t; struct rdtgroup *rdtgrp; int cpu, ret = 0; @@ -721,6 +893,14 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) return -ENOENT; } + /* Give any tasks back to the default group */ + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (t->closid == rdtgrp->closid) + t->closid = 0; + } + read_unlock(&tasklist_lock); + /* Give any CPUs back to the default group */ cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..c8f4152e7265 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1791,6 +1791,9 @@ struct task_struct { /* cg_list protected by css_set_lock and tsk->alloc_lock */ struct list_head cg_list; #endif +#ifdef CONFIG_INTEL_RDT_A + int closid; +#endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT From 60ec2440c63dea88a5ef13e2b2549730a0d75a37 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 28 Oct 2016 15:04:47 -0700 Subject: [PATCH 16/31] x86/intel_rdt: Add schemata file Last of the per resource group files. Also mode 0644. This one shows the resources available to the group. Syntax depends on whether the "cdp" mount option was given. With code/data prioritization disabled it is simply a list of masks for each cache domain. Initial value allows access to all of the L3 cache on all domains. E.g. on a 2 socket Broadwell: L3:0=fffff;1=fffff With CDP enabled, separate masks for data and instructions are provided: L3DATA:0=fffff;1=fffff L3CODE:0=fffff;1=fffff Signed-off-by: Tony Luck Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-9-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 6 + arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 7 + arch/x86/kernel/cpu/intel_rdt_schemata.c | 245 +++++++++++++++++++++++ 4 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/intel_rdt_schemata.c diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index a0dd3e99038d..2e5eab09083e 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -73,6 +73,7 @@ struct rftype { * @num_domains: Number of domains active * @msr_base: Base MSR address for CBMs * @tmp_cbms: Scratch space when updating schemata + * @num_tmp_cbms: Number of CBMs in tmp_cbms * @cache_level: Which cache level defines scope of this domain * @cbm_idx_multi: Multiplier of CBM index * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: @@ -90,6 +91,7 @@ struct rdt_resource { int num_domains; int msr_base; u32 *tmp_cbms; + int num_tmp_cbms; int cache_level; int cbm_idx_multi; int cbm_idx_offset; @@ -170,4 +172,8 @@ DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); void rdt_cbm_update(void *arg); struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); void rdtgroup_kn_unlock(struct kernfs_node *kn); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index b4334e86c1a9..c9f8c818d104 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -34,7 +34,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o +obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 5cc0865f2908..5c4bab9452b0 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -440,6 +440,13 @@ static struct rftype rdtgroup_base_files[] = { .write = rdtgroup_tasks_write, .seq_show = rdtgroup_tasks_show, }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + }, }; static int rdt_num_closids_show(struct kernfs_open_file *of, diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c new file mode 100644 index 000000000000..f369cb8db0d5 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c @@ -0,0 +1,245 @@ +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu + * Tony Luck + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +/* + * Check whether a cache bit mask is valid. The SDM says: + * Please note that all (and only) contiguous '1' combinations + * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). + * Additionally Haswell requires at least two bits set. + */ +static bool cbm_validate(unsigned long var, struct rdt_resource *r) +{ + unsigned long first_bit, zero_bit; + + if (var == 0 || var > r->max_cbm) + return false; + + first_bit = find_first_bit(&var, r->cbm_len); + zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit); + + if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len) + return false; + + if ((zero_bit - first_bit) < r->min_cbm_bits) + return false; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +static int parse_cbm(char *buf, struct rdt_resource *r) +{ + unsigned long data; + int ret; + + ret = kstrtoul(buf, 16, &data); + if (ret) + return ret; + if (!cbm_validate(data, r)) + return -EINVAL; + r->tmp_cbms[r->num_tmp_cbms++] = data; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must appear in the + * right order. + */ +static int parse_line(char *line, struct rdt_resource *r) +{ + char *dom = NULL, *id; + struct rdt_domain *d; + unsigned long dom_id; + + list_for_each_entry(d, &r->domains, list) { + dom = strsep(&line, ";"); + if (!dom) + return -EINVAL; + id = strsep(&dom, "="); + if (kstrtoul(id, 10, &dom_id) || dom_id != d->id) + return -EINVAL; + if (parse_cbm(dom, r)) + return -EINVAL; + } + + /* Any garbage at the end of the line? */ + if (line && line[0]) + return -EINVAL; + return 0; +} + +static int update_domains(struct rdt_resource *r, int closid) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu, idx = 0; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.low = closid; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->cbm[msr_param.low] = r->tmp_cbms[idx++]; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_cbm_update(&msr_param); + /* Update CBM on other cpus. */ + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + char *tok, *resname; + int closid, ret = 0; + u32 *l3_cbms = NULL; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + closid = rdtgrp->closid; + + /* get scratch space to save all the masks while we validate input */ + for_each_enabled_rdt_resource(r) { + r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), + GFP_KERNEL); + if (!r->tmp_cbms) { + ret = -ENOMEM; + goto out; + } + r->num_tmp_cbms = 0; + } + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strsep(&tok, ":"); + if (!tok) { + ret = -EINVAL; + goto out; + } + for_each_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && + closid < r->num_closid) { + ret = parse_line(tok, r); + if (ret) + goto out; + break; + } + } + if (!r->name) { + ret = -EINVAL; + goto out; + } + } + + /* Did the parser find all the masks we need? */ + for_each_enabled_rdt_resource(r) { + if (r->num_tmp_cbms != r->num_domains) { + ret = -EINVAL; + goto out; + } + } + + for_each_enabled_rdt_resource(r) { + ret = update_domains(r, closid); + if (ret) + goto out; + } + +out: + rdtgroup_kn_unlock(of->kn); + for_each_enabled_rdt_resource(r) { + kfree(r->tmp_cbms); + r->tmp_cbms = NULL; + } + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +{ + struct rdt_domain *dom; + bool sep = false; + + seq_printf(s, "%s:", r->name); + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + int closid, ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + closid = rdtgrp->closid; + for_each_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} From 4f341a5e48443fcc2e2d935ca990e462c02bb1a6 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 28 Oct 2016 15:04:48 -0700 Subject: [PATCH 17/31] x86/intel_rdt: Add scheduler hook Hook the x86 scheduler code to update closid based on whether the current task is assigned to a specific closid or running on a CPU assigned to a specific closid. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-10-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 42 ++++++++++++++++++++++++ arch/x86/kernel/cpu/intel_rdt.c | 1 - arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 3 ++ arch/x86/kernel/process_32.c | 4 +++ arch/x86/kernel/process_64.c | 4 +++ 5 files changed, 53 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 2e5eab09083e..5bc72a4dbd5e 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -1,8 +1,12 @@ #ifndef _ASM_X86_INTEL_RDT_H #define _ASM_X86_INTEL_RDT_H +#ifdef CONFIG_INTEL_RDT_A + #include +#include + #define IA32_L3_QOS_CFG 0xc81 #define IA32_L3_CBM_BASE 0xc90 #define IA32_L2_CBM_BASE 0xd10 @@ -176,4 +180,42 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); + +/* + * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control and we enable by mounting the + * resctrl file system. + * - Caches the per cpu CLOSid values and does the MSR write only + * when a task with a different CLOSid is scheduled in. + */ +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) { + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + int closid; + + /* + * If this task has a closid assigned, use it. + * Else use the closid assigned to this cpu. + */ + closid = current->closid; + if (closid == 0) + closid = this_cpu_read(cpu_closid); + + if (closid != state->closid) { + state->closid = closid; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); + } + } +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT_A */ #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 40094aed5f71..5a533fefefa0 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -29,7 +29,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 5c4bab9452b0..a90ad22b9823 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -292,6 +292,9 @@ static void move_myself(struct callback_head *head) kfree(rdtgrp); } + /* update PQR_ASSOC MSR to make resource group go into effect */ + intel_rdt_sched_in(); + kfree(callback); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd7be8efdc4c..efe7f9fce44e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -54,6 +54,7 @@ #include #include #include +#include void __show_regs(struct pt_regs *regs, int all) { @@ -299,5 +300,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); + /* Load the Intel cache allocation PQR MSR. */ + intel_rdt_sched_in(); + return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3c1ca0..acd7d6f507af 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -50,6 +50,7 @@ #include #include #include +#include __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -473,6 +474,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) loadsegment(ss, __KERNEL_DS); } + /* Load the Intel cache allocation PQR MSR. */ + intel_rdt_sched_in(); + return prev_p; } From 48553d103d0b63991a08980889c6a35b3e05b5e3 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 28 Oct 2016 15:04:49 -0700 Subject: [PATCH 18/31] MAINTAINERS: Add maintainer for Intel RDT resource allocation We create five new files for Intel RDT resource allocation: arch/x86/kernel/cpu/intel_rdt.c arch/x86/kernel/cpu/intel_rdt_rdtgroup.c arch/x86/kernel/cpu/intel_rdt_schemata.c arch/x86/include/asm/intel_rdt.h Documentation/x86/intel_rdt_ui.txt Fenghua Yu will maintain this code. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Shaohua Li" Cc: "Sai Prakhya" Cc: "Peter Zijlstra" Cc: "Stephane Eranian" Cc: "Dave Hansen" Cc: "David Carrillo-Cisneros" Cc: "Nilay Vaish" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "Borislav Petkov" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1477692289-37412-11-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c44795306342..4e6a04428c2b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10107,6 +10107,14 @@ L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/sw/rdmavt +RDT - RESOURCE ALLOCATION +M: Fenghua Yu +L: linux-kernel@vger.kernel.org +S: Supported +F: arch/x86/kernel/cpu/intel_rdt* +F: arch/x86/include/asm/intel_rdt* +F: Documentation/x86/intel_rdt* + READ-COPY UPDATE (RCU) M: "Paul E. McKenney" M: Josh Triplett From 8ff42c02193150720815608ed03de8d45aeaf8a8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Nov 2016 17:51:17 +0100 Subject: [PATCH 19/31] x86/intel_rdt: Add a missing #include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... to fix this build warning: In file included from arch/x86/kernel/cpu/intel_rdt.c:33:0: ./arch/x86/include/asm/intel_rdt.h:56:25: warning: ‘struct kernfs_open_file’ declared inside parameter list will not be visible outside of this definition or declaration int (*seq_show)(struct kernfs_open_file *of, ^~~~~~~~~~~~~~~~ ... Signed-off-by: Borislav Petkov Cc: Dave Hansen Cc: David Carrillo-Cisneros Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Ravi V Shankar Cc: Sai Prakhya Cc: Shaohua Li Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Luck Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/20161102165117.23545-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel_rdt.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 5bc72a4dbd5e..6e90e87eadfe 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -3,6 +3,7 @@ #ifdef CONFIG_INTEL_RDT_A +#include #include #include From 7bff0af51012500718971f9cc3485f67252353eb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Nov 2016 14:09:05 -0700 Subject: [PATCH 20/31] x86/intel_rdt: Propagate error in rdt_mount() properly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc complains: "warning: ‘dentry’ may be used uninitialized in this function" The error exit path in rdt_mount(), which deals with a failure in rdtgroup_create_info_dir(), does not set the error code in dentry and returns the uninitialized dentry value. Add the missing error propagation. [tglx: Massaged changelog ] Fixes: 4e978d06dedb ("x86/intel_rdt: Add "info" files to resctrl file system") Signed-off-by: Shaohua Li Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/a56a556f6768dc12cadbf97f49e000189056f90e.1478207143.git.shli@fb.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index a90ad22b9823..e66c7a58505e 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -691,8 +691,10 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type, closid_init(); ret = rdtgroup_create_info_dir(rdtgroup_default.kn); - if (ret) + if (ret) { + dentry = ERR_PTR(ret); goto out_cdp; + } dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); From 53a114a69095eeb0e15d04c2a82358b3192f88df Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Nov 2016 14:09:06 -0700 Subject: [PATCH 21/31] x86/intel_rdt: Export the minimum number of set mask bits in sysfs The minimum number of bits set for a cache mask is checked by the kernel when writing a mask, but there is no way for the user to retrieve this information. Add a new file 'min_cbm_bits' to the info directory and export the information to user space. [ tglx: Massaged changelog ] Signed-off-by: Shaohua Li Cc: fenghua.yu@intel.com Cc: tony.luck@intel.com Link: http://lkml.kernel.org/r/e69b1ffa206d0353eea58101e1bf9b677d9732f7.1478207143.git.shli@fb.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index e66c7a58505e..4795880274f9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -472,6 +472,16 @@ static int rdt_cbm_mask_show(struct kernfs_open_file *of, return 0; } +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->min_cbm_bits); + + return 0; +} + /* rdtgroup information files for one cache resource. */ static struct rftype res_info_files[] = { { @@ -486,6 +496,12 @@ static struct rftype res_info_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_cbm_mask_show, }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + }, }; static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) From 458b0d6e751b04216873a5ee9c899be2cd2f80f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Nov 2016 11:58:12 +0100 Subject: [PATCH 22/31] x86/intel_rdt: Add info files to Documentation The content of the directories and files in the info directory of the resctrl filesystem are not documented. Add the missing bits and pieces. Signed-off-by: Thomas Gleixner Cc: Shaohua Li Cc: Fenghua Yu Cc: Tony Luck --- Documentation/x86/intel_rdt_ui.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt index 3b0ebd4cf423..d918d268cd72 100644 --- a/Documentation/x86/intel_rdt_ui.txt +++ b/Documentation/x86/intel_rdt_ui.txt @@ -17,6 +17,25 @@ mount options are: "cdp": Enable code/data prioritization in L3 cache allocations. +Info directory +-------------- + +The 'info' directory contains information about the enabled +resources. Each resource has its own subdirectory. The subdirectory +names reflect the resource names. Each subdirectory contains the +following files: + +"num_closids": The number of CLOSIDs which are valid for this + resource. The kernel uses the smallest number of + CLOSIDs of all enabled resources as limit. + +"cbm_mask": The bitmask which is valid for this resource. This + mask is equivalent to 100%. + +"min_cbm_bits": The minimum number of consecutive bits which must be + set when writing a mask. + + Resource groups --------------- Resource groups are represented as directories in the resctrl file From f57b308728902d9ffade53466e9201e999a870e4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 11 Nov 2016 17:02:36 -0800 Subject: [PATCH 23/31] x86/intel_rdt: Protect info directory from removal The info directory and the per-resource subdirectories of the info directory have no reference to a struct rdtgroup in kn->priv. An attempt to remove one of those directories results in a NULL pointer dereference. Protect the directories from removal and return -EPERM instead of -ENOENT. [ tglx: Massaged changelog ] Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Sai Prakhya" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1478912558-55514-1-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 4795880274f9..cff286e6d69f 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -644,16 +644,29 @@ static int parse_rdtgroupfs_options(char *data) */ static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) { - if (kernfs_type(kn) == KERNFS_DIR) - return kn->priv; - else + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || kn->parent == kn_info) + return NULL; + else + return kn->priv; + } else { return kn->parent->priv; + } } struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) { struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + if (!rdtgrp) + return NULL; + atomic_inc(&rdtgrp->waitcount); kernfs_break_active_protection(kn); @@ -670,6 +683,9 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) { struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + if (!rdtgrp) + return; + mutex_unlock(&rdtgroup_mutex); if (atomic_dec_and_test(&rdtgrp->waitcount) && @@ -918,7 +934,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) rdtgrp = rdtgroup_kn_lock_live(kn); if (!rdtgrp) { rdtgroup_kn_unlock(kn); - return -ENOENT; + return -EPERM; } /* Give any tasks back to the default group */ From a2584e1d5a74e8f5387d6484ad4c73ded36aa2ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Nov 2016 15:12:13 +0100 Subject: [PATCH 24/31] x86/intel_rdt: Prevent deadlock against hotplug lock The cpu online/offline callbacks of intel_rdt lock rdtgroup_mutex nested inside of cpu hotplug lock. rdtgroup_cpus_write() does it in reverse order. Remove the get/put_online_cpus() calls from rdtgroup_cpus_write(). This is safe against cpu hotplug as the resource group cpumasks are protected by rdtgroup_mutex. Found by review, but should have been found if authors would have bothered to test cpu hotplug with lockdep enabled. Signed-off-by: Thomas Gleixner Cc: Shaohua Li Cc: Fenghua Yu Cc: Tony Luck --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index cff286e6d69f..2f54931e0fa9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -207,6 +207,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, free_cpumask_var(tmpmask); return -ENOMEM; } + rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { ret = -ENOENT; @@ -217,12 +218,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, if (ret) goto unlock; - get_online_cpus(); /* check that user didn't specify any offline cpus */ cpumask_andnot(tmpmask, newmask, cpu_online_mask); if (cpumask_weight(tmpmask)) { ret = -EINVAL; - goto end; + goto unlock; } /* Check whether cpus are dropped from this group */ @@ -231,7 +231,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* Can't drop from default group */ if (rdtgrp == &rdtgroup_default) { ret = -EINVAL; - goto end; + goto unlock; } /* Give any dropped cpus to rdtgroup_default */ cpumask_or(&rdtgroup_default.cpu_mask, @@ -258,8 +258,6 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* Done pushing/pulling - update this group with new mask */ cpumask_copy(&rdtgrp->cpu_mask, newmask); -end: - put_online_cpus(); unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); From 59fe5a77d473f3519dbee8ef5e77c69897a838f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Nov 2016 15:17:12 +0100 Subject: [PATCH 25/31] x86/intel_rdt: Select KERNFS when enabling INTEL_RDT_A arch/x86/kernel/cpu/intel_rdt_rdtgroup.c: In function 'rdtgroup_kn_lock_live': arch/x86/kernel/cpu/intel_rdt_rdtgroup.c:658:2: error: implicit declaration of function 'kernfs_break_active_protection' [-Werror=implicit-function-declaration] Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Shaohua Li Cc: Fenghua Yu Cc: Tony Luck --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 770fb5f23cea..dcca4ec42770 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -411,6 +411,7 @@ config INTEL_RDT_A bool "Intel Resource Director Technology Allocation support" default n depends on X86 && CPU_SUP_INTEL + select KERNFS help Select to enable resource allocation which is a sub-feature of Intel Resource Director Technology(RDT). More information about From c7cc0cc10cdecc275211c8749defba6c41aaf5de Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 11 Nov 2016 17:02:37 -0800 Subject: [PATCH 26/31] x86/intel_rdt: Reset per cpu closids on unmount All CPUs in a rdtgroup are given back to the default rdtgroup before the rdtgroup is removed during umount. After umount, the default rdtgroup contains all online CPUs, but the per cpu closids are not cleared. As a result the stale closid value will be used immediately after the next mount. Move all cpus to the default group and update the percpu closid storage. [ tglx: Massaged changelong ] Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Sai Prakhya" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1478912558-55514-2-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 2f54931e0fa9..d6bad092f542 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -799,6 +799,7 @@ static void rmdir_all_sub(void) { struct rdtgroup *rdtgrp, *tmp; struct task_struct *p, *t; + int cpu; /* move all tasks to default resource group */ read_lock(&tasklist_lock); @@ -813,14 +814,29 @@ static void rmdir_all_sub(void) smp_call_function_many(cpu_online_mask, rdt_reset_pqr_assoc_closid, NULL, 1); put_cpu(); + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; + + /* + * Give any CPUs back to the default group. We cannot copy + * cpu_online_mask because a CPU might have executed the + * offline callback already, but is still marked online. + */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } + + /* Reset all per cpu closids to the default value */ + for_each_cpu(cpu, &rdtgroup_default.cpu_mask) + per_cpu(cpu_closid, cpu) = 0; + kernfs_remove(kn_info); } From f410770293a1fbc08906474c24104a7a11943eb6 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 11 Nov 2016 17:02:38 -0800 Subject: [PATCH 27/31] x86/intel_rdt: Update percpu closid immeditately on CPUs affected by changee If CPUs are moved to or removed from a rdtgroup, the percpu closid storage is updated. If tasks running on an affected CPU use the percpu closid then the PQR_ASSOC MSR is only updated when the task runs through a context switch. Up to the context switch the CPUs operate on the wrong closid. This state is potentially unbound. Make the change immediately effective by invoking a smp function call on the affected CPUs which stores the new closid in the perpu storage and calls the rdt_sched_in() function which updates the MSR, if the current task uses the percpu closid. [ tglx: Made it work and massaged changelog once more ] Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Sai Prakhya" Cc: "Vikas Shivappa" Cc: "Ingo Molnar" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1478912558-55514-3-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 72 ++++++++++++------------ 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index d6bad092f542..98edba431033 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -191,12 +191,40 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, return ret; } +/* + * This is safe against intel_rdt_sched_in() called from __switch_to() + * because __switch_to() is executed with interrupts disabled. A local call + * from rdt_update_percpu_closid() is proteced against __switch_to() because + * preemption is disabled. + */ +static void rdt_update_cpu_closid(void *v) +{ + this_cpu_write(cpu_closid, *(int *)v); + /* + * We cannot unconditionally write the MSR because the current + * executing task might have its own closid selected. Just reuse + * the context switch code. + */ + intel_rdt_sched_in(); +} + +/* Update the per cpu closid and eventually the PGR_ASSOC MSR */ +static void rdt_update_percpu_closid(const struct cpumask *cpu_mask, int closid) +{ + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_update_cpu_closid(&closid); + smp_call_function_many(cpu_mask, rdt_update_cpu_closid, &closid, 1); + put_cpu(); +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { cpumask_var_t tmpmask, newmask; struct rdtgroup *rdtgrp, *r; - int ret, cpu; + int ret; if (!buf) return -EINVAL; @@ -236,8 +264,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* Give any dropped cpus to rdtgroup_default */ cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, tmpmask); - for_each_cpu(cpu, tmpmask) - per_cpu(cpu_closid, cpu) = 0; + rdt_update_percpu_closid(tmpmask, rdtgroup_default.closid); } /* @@ -251,8 +278,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, continue; cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); } - for_each_cpu(cpu, tmpmask) - per_cpu(cpu_closid, cpu) = rdtgrp->closid; + rdt_update_percpu_closid(tmpmask, rdtgroup_default.closid); } /* Done pushing/pulling - update this group with new mask */ @@ -780,18 +806,6 @@ static int reset_all_cbms(struct rdt_resource *r) return 0; } -/* - * MSR_IA32_PQR_ASSOC is scoped per logical CPU, so all updates - * are always in thread context. - */ -static void rdt_reset_pqr_assoc_closid(void *v) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); -} - /* * Forcibly remove all of subdirectories under root. */ @@ -799,7 +813,6 @@ static void rmdir_all_sub(void) { struct rdtgroup *rdtgrp, *tmp; struct task_struct *p, *t; - int cpu; /* move all tasks to default resource group */ read_lock(&tasklist_lock); @@ -807,14 +820,6 @@ static void rmdir_all_sub(void) t->closid = 0; read_unlock(&tasklist_lock); - get_cpu(); - /* Reset PQR_ASSOC MSR on this cpu. */ - rdt_reset_pqr_assoc_closid(NULL); - /* Reset PQR_ASSOC MSR on the rest of cpus. */ - smp_call_function_many(cpu_online_mask, rdt_reset_pqr_assoc_closid, - NULL, 1); - put_cpu(); - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) @@ -828,15 +833,13 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + rdt_update_percpu_closid(&rdtgrp->cpu_mask, + rdtgroup_default.closid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } - - /* Reset all per cpu closids to the default value */ - for_each_cpu(cpu, &rdtgroup_default.cpu_mask) - per_cpu(cpu_closid, cpu) = 0; - kernfs_remove(kn_info); } @@ -943,7 +946,6 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) { struct task_struct *p, *t; struct rdtgroup *rdtgrp; - int cpu, ret = 0; rdtgrp = rdtgroup_kn_lock_live(kn); if (!rdtgrp) { @@ -962,8 +964,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) /* Give any CPUs back to the default group */ cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = 0; + rdt_update_percpu_closid(&rdtgrp->cpu_mask, rdtgroup_default.closid); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); @@ -977,8 +978,7 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) kernfs_remove(rdtgrp->kn); rdtgroup_kn_unlock(kn); - - return ret; + return 0; } static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { From 2659f46da8307871989f475accdcdfc4807e9e6c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 18 Nov 2016 15:18:03 -0800 Subject: [PATCH 28/31] x86/intel_rdt: Fix setting of closid when adding CPUs to a group There was a cut & paste error when adding code to update the per-cpu closid when changing the bitmask of CPUs to an rdt group. The update erronously assigns the closid of the default group to the CPUs which are moved to a group instead of assigning the closid of their new group. Use the proper closid. Fixes: f410770293a1 ("x86/intel_rdt: Update percpu closid immeditately on CPUs affected by change") Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Sai Prakhya" Cc: "Vikas Shivappa" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1479511084-59727-1-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 98edba431033..eccea8a6adf8 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -278,7 +278,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, continue; cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); } - rdt_update_percpu_closid(tmpmask, rdtgroup_default.closid); + rdt_update_percpu_closid(tmpmask, rdtgrp->closid); } /* Done pushing/pulling - update this group with new mask */ From 0efc89be9471b152599d2db7eb47de8e0d71c59f Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 18 Nov 2016 15:18:04 -0800 Subject: [PATCH 29/31] x86/intel_rdt: Update task closid immediately on CPU in rmdir and unmount When removing a sub directory/rdtgroup by rmdir or umount, closid in a task in the sub directory is set to default rdtgroup's closid which is 0. If the task is running on a CPU, the PQR_ASSOC MSR is only updated when the task runs through a context switch. Up to the context switch, the task runs with the wrong closid. Make the change immediately effective by invoking a smp function call on all CPUs which are running moved task. If one of the affected tasks was moved or scheduled out before the function call is executed on the CPU the only damage is the extra interruption of the CPU. [ tglx: Reworked it to avoid blindly interrupting all CPUs and extra loops ] Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Sai Prakhya" Cc: "Vikas Shivappa" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1479511084-59727-2-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 113 +++++++++++++++++------ 1 file changed, 83 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index eccea8a6adf8..fb8e03e2b70d 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -194,12 +194,13 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call - * from rdt_update_percpu_closid() is proteced against __switch_to() because + * from rdt_update_closid() is proteced against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *v) +static void rdt_update_cpu_closid(void *closid) { - this_cpu_write(cpu_closid, *(int *)v); + if (closid) + this_cpu_write(cpu_closid, *(int *)closid); /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse @@ -208,14 +209,23 @@ static void rdt_update_cpu_closid(void *v) intel_rdt_sched_in(); } -/* Update the per cpu closid and eventually the PGR_ASSOC MSR */ -static void rdt_update_percpu_closid(const struct cpumask *cpu_mask, int closid) +/* + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, + * + * Per task closids must have been set up before calling this function. + * + * The per cpu closids are updated with the smp function call, when @closid + * is not NULL. If @closid is NULL then all affected percpu closids must + * have been set up before calling this function. + */ +static void +rdt_update_closid(const struct cpumask *cpu_mask, int *closid) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(&closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, &closid, 1); + rdt_update_cpu_closid(closid); + smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); put_cpu(); } @@ -264,7 +274,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, /* Give any dropped cpus to rdtgroup_default */ cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_percpu_closid(tmpmask, rdtgroup_default.closid); + rdt_update_closid(tmpmask, &rdtgroup_default.closid); } /* @@ -278,7 +288,7 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, continue; cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); } - rdt_update_percpu_closid(tmpmask, rdtgrp->closid); + rdt_update_closid(tmpmask, &rdtgrp->closid); } /* Done pushing/pulling - update this group with new mask */ @@ -806,19 +816,50 @@ static int reset_all_cbms(struct rdt_resource *r) return 0; } +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). + * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || t->closid == from->closid) { + t->closid = to->closid; +#ifdef CONFIG_SMP + /* + * This is safe on x86 w/o barriers as the ordering + * of writing to task_cpu() and t->on_cpu is + * reverse to the reading here. The detection is + * inaccurate as tasks might move or schedule + * before the smp function call takes place. In + * such a case the function call is pointless, but + * there is no other side effect. + */ + if (mask && t->on_cpu) + cpumask_set_cpu(task_cpu(t), mask); +#endif + } + } + read_unlock(&tasklist_lock); +} + /* * Forcibly remove all of subdirectories under root. */ static void rmdir_all_sub(void) { struct rdtgroup *rdtgrp, *tmp; - struct task_struct *p, *t; - /* move all tasks to default resource group */ - read_lock(&tasklist_lock); - for_each_process_thread(p, t) - t->closid = 0; - read_unlock(&tasklist_lock); + /* Move all tasks to the default resource group */ + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { /* Remove each rdtgroup other than root */ @@ -833,13 +874,15 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - rdt_update_percpu_closid(&rdtgrp->cpu_mask, - rdtgroup_default.closid); - kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + get_online_cpus(); + rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + put_online_cpus(); + kernfs_remove(kn_info); } @@ -944,27 +987,35 @@ out_unlock: static int rdtgroup_rmdir(struct kernfs_node *kn) { - struct task_struct *p, *t; + int ret, cpu, closid = rdtgroup_default.closid; struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; rdtgrp = rdtgroup_kn_lock_live(kn); if (!rdtgrp) { - rdtgroup_kn_unlock(kn); - return -EPERM; + ret = -EPERM; + goto out; } /* Give any tasks back to the default group */ - read_lock(&tasklist_lock); - for_each_process_thread(p, t) { - if (t->closid == rdtgrp->closid) - t->closid = 0; - } - read_unlock(&tasklist_lock); + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); /* Give any CPUs back to the default group */ cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - rdt_update_percpu_closid(&rdtgrp->cpu_mask, rdtgroup_default.closid); + + /* Update per cpu closid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(cpu_closid, cpu) = closid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + rdt_update_closid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); @@ -976,9 +1027,11 @@ static int rdtgroup_rmdir(struct kernfs_node *kn) */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - + ret = 0; +out: rdtgroup_kn_unlock(kn); - return 0; + free_cpumask_var(tmpmask); + return ret; } static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { From 74fcdae1a7fdf30de5413ccc1eca271415d01124 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 1 Dec 2016 12:55:14 -0800 Subject: [PATCH 30/31] x86/intel_rdt: Call intel_rdt_sched_in() with preemption disabled intel_rdt_sched_in() must be called with preemption disabled because the function accesses percpu variables (pqr_state and closid). If a task moves itself via move_myself() preemption is enabled, which violates the calling convention and can result in incorrect closid selection when the task gets preempted or migrated. Add the required protection and a comment about the calling convention. Signed-off-by: Fenghua Yu Cc: "Ravi V Shankar" Cc: "Tony Luck" Cc: "Marcelo Tosatti" Cc: "Sai Prakhya" Cc: "Vikas Shivappa" Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/1480625714-54246-1-git-send-email-fenghua.yu@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/intel_rdt.h | 2 ++ arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 6e90e87eadfe..95ce5c85b009 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -192,6 +192,8 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, * resctrl file system. * - Caches the per cpu CLOSid values and does the MSR write only * when a task with a different CLOSid is scheduled in. + * + * Must be called with preemption disabled. */ static inline void intel_rdt_sched_in(void) { diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index fb8e03e2b70d..1afd3f393501 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -326,8 +326,10 @@ static void move_myself(struct callback_head *head) kfree(rdtgrp); } + preempt_disable(); /* update PQR_ASSOC MSR to make resource group go into effect */ intel_rdt_sched_in(); + preempt_enable(); kfree(callback); } From 76ae054c69a745ded388fc4ae70422d74c5bc77d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 2 Dec 2016 14:21:06 -0800 Subject: [PATCH 31/31] x86/intel_rdt: Implement show_options() for resctrlfs Implement show_options() callback for intel resource control filesystem to expose the active mount options in /proc/ Signed-off-by: Shaohua Li Cc: Fenghua Yu Link: http://lkml.kernel.org/r/7dce7c1886ac9289442d254ea18322c92bd968da.1480717072.git.shli@fb.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 1afd3f393501..8af04afdfcb9 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -1036,9 +1036,17 @@ out: return ret; } +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + seq_puts(seq, ",cdp"); + return 0; +} + static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { - .mkdir = rdtgroup_mkdir, - .rmdir = rdtgroup_rmdir, + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .show_options = rdtgroup_show_options, }; static int __init rdtgroup_setup_root(void)