From: Tim Chen <tim.c.chen@linux.intel.com>
There are x86 CPU architectures (e.g. Jacobsville) where the L2 cache is shared among a cluster of cores instead of being exclusive to a single core.
To prevent oversubscription of L2 cache, load should be balanced between such L2 clusters, especially for tasks with no shared data.
On benchmarks such as the SPECrate mcf test, this change provides a performance boost on medium-load systems on Jacobsville.

[ Hi Tim, could you please post some supporting data here? Thanks, Barry ]
Note that this added domain level will increase migrations between CPUs, so it is not necessarily a universal win if the migration cost of balancing L2 load outweighs the benefit from reduced L2 contention. The change tends to benefit CPU-bound threads, which get moved around much less.
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 arch/x86/Kconfig                |  8 ++++++
 arch/x86/include/asm/smp.h      |  7 ++++++
 arch/x86/include/asm/topology.h |  3 +++
 arch/x86/kernel/cpu/cacheinfo.c |  1 +
 arch/x86/kernel/cpu/common.c    |  3 +++
 arch/x86/kernel/smpboot.c       | 44 ++++++++++++++++++++++++++++++++-
 6 files changed, 65 insertions(+), 1 deletion(-)
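As a quick sanity check after applying (a hypothetical userspace sketch, not part of the patch): with CONFIG_SCHED_CLUSTER=y and CONFIG_SCHED_DEBUG=y, the new CLS level should appear between SMT and MC in the sched domain hierarchy. The program below dumps the domain names for CPU 0. The debugfs path is the post-v5.13 location; older kernels expose the same files under /proc/sys/kernel/sched_domain/ instead.

#include <stdio.h>

int main(void)
{
	char path[128], name[64];
	FILE *f;
	int d;

	/* Walk domain0..domainN for cpu0 and print each level's name. */
	for (d = 0; ; d++) {
		snprintf(path, sizeof(path),
			 "/sys/kernel/debug/sched/domains/cpu0/domain%d/name", d);
		f = fopen(path, "r");
		if (!f)
			break;
		if (fgets(name, sizeof(name), f))
			printf("domain%d: %s", d, name);	/* e.g. SMT, CLS, MC */
		fclose(f);
	}
	return 0;
}

On a Jacobsville-like system one would expect CLS between SMT and MC; on parts without shared-L2 clusters the CLS level should be degenerated away by the scheduler.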
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b44190..5a9777707fbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1007,6 +1007,14 @@ config NR_CPUS
 	  This is purely to save memory: each supported CPU adds about 8KB
 	  to the kernel image.
 
+config SCHED_CLUSTER
+	bool "Cluster scheduler support"
+	default n
+	help
+	  Cluster scheduler support improves the CPU scheduler's decision
+	  making when dealing with machines that have clusters of CPUs
+	  sharing L2 cache. If unsure say N here.
+
 config SCHED_SMT
 	def_bool y if SMP
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 630ff08532be..08b0e90623ad 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -16,7 +16,9 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
 /* cpus sharing the last level cache: */
 DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
 
 static inline struct cpumask *cpu_llc_shared_mask(int cpu)
@@ -24,6 +26,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 	return per_cpu(cpu_llc_shared_map, cpu);
 }
 
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+	return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 9239399e5491..2548d824f103 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -103,17 +103,20 @@ static inline void setup_node_to_cpumask_map(void) { }
 #include <asm-generic/topology.h>
 
 extern const struct cpumask *cpu_coregroup_mask(int cpu);
+extern const struct cpumask *cpu_clustergroup_mask(int cpu);
 
 #define topology_logical_package_id(cpu)	(cpu_data(cpu).logical_proc_id)
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).phys_proc_id)
 #define topology_logical_die_id(cpu)		(cpu_data(cpu).logical_die_id)
 #define topology_die_id(cpu)			(cpu_data(cpu).cpu_die_id)
+#define topology_cluster_id(cpu)		(per_cpu(cpu_l2c_id, cpu))
 #define topology_core_id(cpu)			(cpu_data(cpu).cpu_core_id)
 
 extern unsigned int __max_die_per_package;
 
 #ifdef CONFIG_SMP
 #define topology_die_cpumask(cpu)		(per_cpu(cpu_die_map, cpu))
+#define topology_cluster_cpumask(cpu)		(cpu_clustergroup_mask(cpu))
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
 #define topology_sibling_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
 
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index d66af2950e06..3528987fef1d 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -846,6 +846,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
 		l2 = new_l2;
 #ifdef CONFIG_SMP
 		per_cpu(cpu_llc_id, cpu) = l2_id;
+		per_cpu(cpu_l2c_id, cpu) = l2_id;
 #endif
 	}
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a1b756c49a93..8bf1049ff614 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -78,6 +78,9 @@ EXPORT_SYMBOL(smp_num_siblings);
 /* Last level cache ID of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
 
+/* L2 cache ID of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id) = BAD_APICID;
+
 /* correctly size the local cpu masks */
 void __init setup_cpu_local_masks(void)
 {
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7770245cc7fa..3162d0fc6b3c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -101,6 +101,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
 
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
@@ -466,6 +468,21 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+	/* Do not match if we do not have a valid APICID for cpu: */
+	if (per_cpu(cpu_l2c_id, cpu1) == BAD_APICID)
+		return false;
+
+	/* Do not match if L2 cache id does not match: */
+	if (per_cpu(cpu_l2c_id, cpu1) != per_cpu(cpu_l2c_id, cpu2))
+		return false;
+
+	return topology_sane(c, o, "l2c");
+}
+
 /*
  * Unlike the other levels, we do not enforce keeping a
  * multicore group inside a NUMA node. If this happens, we will
@@ -525,7 +542,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 }
 
-#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_CLUSTER) || defined(CONFIG_SCHED_MC)
 static inline int x86_sched_itmt_flags(void)
 {
 	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
@@ -543,12 +560,21 @@ static int x86_smt_flags(void)
 	return cpu_smt_flags() | x86_sched_itmt_flags();
 }
 #endif
+#ifdef CONFIG_SCHED_CLUSTER
+static int x86_cluster_flags(void)
+{
+	return cpu_cluster_flags() | x86_sched_itmt_flags();
+}
+#endif
 #endif
 
 static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
+#ifdef CONFIG_SCHED_CLUSTER
+	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
@@ -559,6 +585,9 @@ static struct sched_domain_topology_level x86_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
+#ifdef CONFIG_SCHED_CLUSTER
+	{ cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
 #ifdef CONFIG_SCHED_MC
 	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
@@ -586,6 +615,7 @@ void set_cpu_sibling_map(int cpu)
 	if (!has_mp) {
 		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
+		cpumask_set_cpu(cpu, cpu_l2c_shared_mask(cpu));
 		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
 		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
 		c->booted_cores = 1;
@@ -604,6 +634,9 @@ void set_cpu_sibling_map(int cpu)
 		if ((i == cpu) || (has_mp && match_llc(c, o)))
 			link_mask(cpu_llc_shared_mask, cpu, i);
 
+		if ((i == cpu) || (has_mp && match_l2c(c, o)))
+			link_mask(cpu_l2c_shared_mask, cpu, i);
+
 		if ((i == cpu) || (has_mp && match_die(c, o)))
 			link_mask(topology_die_cpumask, cpu, i);
 	}
@@ -651,6 +684,11 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
 	return cpu_llc_shared_mask(cpu);
 }
 
+const struct cpumask *cpu_clustergroup_mask(int cpu)
+{
+	return cpu_l2c_shared_mask(cpu);
+}
+
 static void impress_friends(void)
 {
 	int cpu;
@@ -1334,6 +1372,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
 	}
 
 	/*
@@ -1558,7 +1597,10 @@ static void remove_siblinginfo(int cpu)
 		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
 	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
 		cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+	for_each_cpu(sibling, cpu_l2c_shared_mask(cpu))
+		cpumask_clear_cpu(cpu, cpu_l2c_shared_mask(sibling));
 	cpumask_clear(cpu_llc_shared_mask(cpu));
+	cpumask_clear(cpu_l2c_shared_mask(cpu));
 	cpumask_clear(topology_sibling_cpumask(cpu));
 	cpumask_clear(topology_core_cpumask(cpu));
 	cpumask_clear(topology_die_cpumask(cpu));