Hi Yicong and Tim,

This is the 2nd patchset following the 1st one:
https://op-lists.linaro.org/pipermail/linaro-open-discussions/2021-June/0001...
While the 1st patchset focused only on the spreading path, this patchset is
mainly about the packing path. So far I have only tested tbench4 on one NUMA
node. With the spreading path only, I am seeing up to a 5% performance decrease
on tbench4; but with this patchset on top, I can see up to a 28% performance
increase, compared to the case without the cluster scheduler.

I am running the benchmark with Mel's mmtests using this config file,
configs/config-scheduler-schbench-1numa:

# MM Test Parameters
export MMTESTS="tbench4"
# List of monitors
export RUN_MONITOR=yes
export MONITORS_ALWAYS=
export MONITORS_GZIP="proc-vmstat mpstat"
export MONITORS_WITH_LATENCY="vmstat"
export MONITOR_UPDATE_FREQUENCY=10

# TBench
export TBENCH_DURATION=60
export TBENCH_MIN_CLIENTS=1
export TBENCH_MAX_CLIENTS=96
with commands like:

numactl -N 0 -m 0 ./run-mmtests.sh --no-monitor -c configs/config-scheduler-schbench-1numa testtag
My machine has 4 NUMA nodes; each node has 24 cores (6 clusters).

Hopefully we will get more benchmark cases such as pgbench, hackbench, etc. on
both one NUMA node and four NUMA nodes.
Hi Yicong,
Note we might also need to test the case where jump label is disabled.
Thanks
Barry
Barry Song (4):
  sched: Add infrastructure to describe if cluster scheduler is really running
  sched: Add per_cpu cluster domain info and cpus_share_cluster API
  sched/fair: Scan cluster before scanning llc in wake-up path
  sched/fair: Use cpus_share_cluster to further pull wakee

 include/linux/sched/cluster.h  | 19 ++++++++++++++
 include/linux/sched/sd_flags.h |  9 +++++++
 include/linux/sched/topology.h |  8 +++++-
 kernel/sched/core.c            | 28 ++++++++++++++++++++
 kernel/sched/fair.c            | 58 +++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h           |  3 +++
 kernel/sched/topology.c        | 11 ++++++++
 7 files changed, 131 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/sched/cluster.h
Use a sched_cluster_present static key to describe whether the cluster
scheduler is really working, and add a sched_cluster_active() API to reflect
the status of the cluster scheduler as well. Only if the cluster is bigger
than SMT and smaller than the coregroup can we be quite sure the cluster
scheduler is in use.
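As a rough illustration only (not part of this patch), a caller could gate
cluster-specific work behind the static key roughly like below;
cluster_aware_wakeup() is a made-up placeholder:

#include <linux/sched/cluster.h>

static int pick_wakeup_cpu(struct task_struct *p, int target)
{
	/*
	 * sched_cluster_active() compiles down to a static branch, so
	 * systems without a useful cluster level pay almost nothing here.
	 * cluster_aware_wakeup() is hypothetical, for illustration only.
	 */
	if (sched_cluster_active())
		return cluster_aware_wakeup(p, target);

	return target;
}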
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 include/linux/sched/cluster.h | 19 +++++++++++++++++++
 kernel/sched/core.c           | 20 ++++++++++++++++++++
 kernel/sched/fair.c           |  4 ++++
 kernel/sched/sched.h          |  1 +
 4 files changed, 44 insertions(+)
 create mode 100644 include/linux/sched/cluster.h
diff --git a/include/linux/sched/cluster.h b/include/linux/sched/cluster.h
new file mode 100644
index 0000000..ea6c475
--- /dev/null
+++ b/include/linux/sched/cluster.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_CLUSTER_H
+#define _LINUX_SCHED_CLUSTER_H
+
+#include <linux/static_key.h>
+
+#ifdef CONFIG_SCHED_CLUSTER
+extern struct static_key_false sched_cluster_present;
+
+static __always_inline bool sched_cluster_active(void)
+{
+	return static_branch_likely(&sched_cluster_present);
+}
+#else
+static inline bool sched_cluster_active(void) { return false; }
+
+#endif
+
+#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9e9a5be..dd5984d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8655,6 +8655,17 @@ int sched_cpu_activate(unsigned int cpu)
 	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
 		static_branch_inc_cpuslocked(&sched_smt_present);
 #endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+	/*
+	 * When going up, increment the number of cluster cpus with
+	 * cluster present.
+	 */
+	if (cpumask_weight(cpu_cluster_mask(cpu)) > cpumask_weight(cpu_smt_mask(cpu)) &&
+	    cpumask_weight(cpu_cluster_mask(cpu)) < cpumask_weight(cpu_coregroup_mask(cpu)))
+		static_branch_inc_cpuslocked(&sched_cluster_present);
+#endif
+
 	set_cpu_active(cpu, true);
 
 	if (sched_smp_initialized) {
@@ -8731,6 +8742,15 @@ int sched_cpu_deactivate(unsigned int cpu)
 		static_branch_dec_cpuslocked(&sched_smt_present);
 #endif
 
+#ifdef CONFIG_SCHED_CLUSTER
+	/*
+	 * When going down, decrement the number of cpus with cluster present.
+	 */
+	if (cpumask_weight(cpu_cluster_mask(cpu)) > cpumask_weight(cpu_smt_mask(cpu)) &&
+	    cpumask_weight(cpu_cluster_mask(cpu)) < cpumask_weight(cpu_coregroup_mask(cpu)))
+		static_branch_dec_cpuslocked(&sched_cluster_present);
+#endif
+
 	if (!sched_smp_initialized)
 		return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ce625bf..8578cb1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5997,6 +5997,10 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p)
 	return -1;
 }
 
+#ifdef CONFIG_SCHED_CLUSTER
+DEFINE_STATIC_KEY_FALSE(sched_cluster_present);
+#endif
+
 #ifdef CONFIG_SCHED_SMT
 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
 EXPORT_SYMBOL_GPL(sched_smt_present);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8f0194c..0f8f610 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
 
 #include <linux/sched/autogroup.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/cluster.h>
 #include <linux/sched/coredump.h>
 #include <linux/sched/cpufreq.h>
 #include <linux/sched/cputime.h>
wake_affine will need the per-CPU cluster domain and the cpus_share_cluster()
API to select a proper idle sibling within the cluster.
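As an illustrative sketch of the intent (not part of this patch), a wake-up
path could use the new API roughly like below; prev_cpu_is_cluster_affine()
is a hypothetical helper name:

/*
 * Sketch only: prefer the wakee's previous CPU when it is idle and sits in
 * the same cluster as the target, since CPUs in one cluster share resources
 * such as llc tags or l2 and are the cheapest migration destination.
 */
static bool prev_cpu_is_cluster_affine(int prev, int target)
{
	return available_idle_cpu(prev) && cpus_share_cluster(prev, target);
}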
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 include/linux/sched/sd_flags.h |  9 +++++++++
 include/linux/sched/topology.h |  8 +++++++-
 kernel/sched/core.c            |  5 +++++
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        | 11 +++++++++++
 5 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 34b21e9..e4e651e 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -100,6 +100,15 @@ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 /*
+ * Domain members share CPU cluster resources (i.e. llc cache tags or l2)
+ *
+ * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
+ *               the cluster resources (such as llc tags or l2)
+ * NEEDS_GROUPS: Caches are shared between groups.
+ */
+SD_FLAG(SD_SHARE_CLS_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
+/*
  * Domain members share CPU package resources (i.e. caches)
  *
  * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2f9166f..feb6339 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_SHARE_CLS_RESOURCES | SD_SHARE_PKG_RESOURCES;
 }
 #endif
@@ -177,6 +177,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_cluster(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -230,6 +231,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_cluster(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif /* !CONFIG_SMP */
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dd5984d..47a4c82 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3502,6 +3502,11 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
+bool cpus_share_cluster(int this_cpu, int that_cpu)
+{
+	return per_cpu(sd_cluster_id, this_cpu) == per_cpu(sd_cluster_id, that_cpu);
+}
+
 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 {
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0f8f610..193e70d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1765,6 +1765,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
+DECLARE_PER_CPU(int, sd_cluster_id);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 2b1bc26..63e17fe 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -644,6 +644,8 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
+DEFINE_PER_CPU(int, sd_cluster_id);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -657,6 +659,14 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 	int size = 1;
 
+	sd = highest_flag_domain(cpu, SD_SHARE_CLS_RESOURCES);
+	if (sd) {
+		id = cpumask_first(sched_domain_span(sd));
+		size = cpumask_weight(sched_domain_span(sd));
+	}
+	rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd);
+	per_cpu(sd_cluster_id, cpu) = id;
+
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
@@ -1392,6 +1402,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
  */
 #define TOPOLOGY_SD_FLAGS	\
 	(SD_SHARE_CPUCAPACITY	| \
+	 SD_SHARE_CLS_RESOURCES	| \
 	 SD_SHARE_PKG_RESOURCES	| \
 	 SD_NUMA		| \
 	 SD_ASYM_PACKING)
If we only do spreading as in patchset [1], workloads that are sensitive to
the wake-up path and cache affinity, such as tbench4, show a performance
degradation: (one NUMA node, 24 cores, 6 clusters)

tbench4
                          tbench                    tbench
                   1numa-w/o-cluster     1numa-w/-cluster-spread-only
Hmean     1       314.11 (   0.00%)       297.22 *  -5.38%*
Hmean     2       612.83 (   0.00%)       597.33 *  -2.53%*
Hmean     4      1232.90 (   0.00%)      1211.53 *  -1.73%*
Hmean     8      2470.26 (   0.00%)      2465.05 *  -0.21%*
Hmean     16     3203.89 (   0.00%)      3194.06 *  -0.31%*
Hmean     32     6210.83 (   0.00%)      5984.82 *  -3.64%*
Hmean     64     5053.54 (   0.00%)      5017.10 *  -0.72%*
Hmean     96     4726.44 (   0.00%)      4705.88 *  -0.44%*
This patch scans the cluster before scanning the LLC; when the wakee's
previous CPU and the target share the LLC, it scans the cluster only, unless
the LLC's has_idle_core is true.
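In rough pseudocode (a simplified sketch of the diff below, not the literal
implementation; cluster_domain_of() and llc_domain_of() are shorthand
placeholders for the sd_cluster/sd_llc lookups):

static int wakeup_scan_sketch(struct task_struct *p, int prev, int target,
			      bool has_idle_core)
{
	int cpu;

	/* 1. Always look inside the target's cluster first. */
	cpu = select_idle_cluster(p, cluster_domain_of(target), has_idle_core, target);
	if (cpu >= 0)
		return cpu;

	/*
	 * 2. If prev already shares the LLC with target and there is no idle
	 *    core worth chasing, stop here and simply use target.
	 */
	if (cpus_share_cache(prev, target) && !has_idle_core)
		return target;

	/* 3. Otherwise scan the rest of the LLC, with the cluster CPUs excluded. */
	return select_idle_cpu(p, llc_domain_of(target), has_idle_core, target);
}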
With this change to the scanning path, we see an obvious improvement on
tbench4 while system load is relatively heavy: (one NUMA node, 24 cores,
6 clusters)

tbench4
                          tbench                          tbench
               1numa-w/-cluster-spread-only   1numa-w/-cluster-spread+scancluster
Hmean     1       297.22 (   0.00%)       297.40 (   0.06%)
Hmean     2       597.33 (   0.00%)       597.32 (  -0.00%)
Hmean     4      1211.53 (   0.00%)      1225.72 *   1.17%*
Hmean     8      2465.05 (   0.00%)      2508.96 *   1.78%*
Hmean     16     3194.06 (   0.00%)      4029.90 *  26.17%*
Hmean     32     5984.82 (   0.00%)      6376.05 *   6.54%*
Hmean     64     5017.10 (   0.00%)      5250.62 *   4.65%*
Hmean     96     4705.88 (   0.00%)      4882.62 *   3.76%*
But while system load is relatively light, the cluster scheduler with this
patch still performs worse than the system without the cluster scheduler:
(one NUMA node, 24 cores, 6 clusters)

tbench4
                          tbench                          tbench
                   1numa-w/o-cluster       1numa-w/-cluster-spread+scancluster
Hmean     1       314.11 (   0.00%)       297.40 *  -5.32%*
Hmean     2       612.83 (   0.00%)       597.32 *  -2.53%*
Hmean     4      1232.90 (   0.00%)      1225.72 *  -0.58%*
Hmean     8      2470.26 (   0.00%)      2508.96 *   1.57%*
Hmean     16     3203.89 (   0.00%)      4029.90 *  25.78%*
Hmean     32     6210.83 (   0.00%)      6376.05 *   2.66%*
Hmean     64     5053.54 (   0.00%)      5250.62 *   3.90%*
Hmean     96     4726.44 (   0.00%)      4882.62 *   3.30%*

[ Hi Tim,
  I don't have a machine with both cluster and SMT to test. If you have one,
  please feel free to edit select_idle_cluster() and select_idle_sibling().
  According to
  https://en.wikipedia.org/wiki/Tremont_(microarchitecture)#Processors_for_bas...)
  it seems your machine doesn't have SMT either?
  Thanks
  Barry ]
[1] https://op-lists.linaro.org/pipermail/linaro-open-discussions/2021-June/0001...
[ Todo: need pgbench, hackbench on 1numa, 4numa; need tbench4 on 4numa ]

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 kernel/sched/fair.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8578cb1..fdc2cf8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6132,6 +6132,32 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 #endif /* CONFIG_SCHED_SMT */
 
 /*
+ * Scan the cluster domain for idle CPUs
+ */
+static int select_idle_cluster(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+	int i, cpu, idle_cpu = -1;
+
+	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+	for_each_cpu_wrap(cpu, cpus, target) {
+		if (has_idle_core) {
+			i = select_idle_core(p, cpu, cpus, &idle_cpu);
+			if ((unsigned int)i < nr_cpumask_bits)
+				return i;
+
+		} else {
+			idle_cpu = __select_idle_cpu(cpu, p);
+			if ((unsigned int)idle_cpu < nr_cpumask_bits)
+				break;
+		}
+	}
+
+	return idle_cpu;
+}
+
+/*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
@@ -6141,7 +6167,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	int this = smp_processor_id();
-	struct sched_domain *this_sd;
+	struct sched_domain *this_sd, *cluster_sd;
 	u64 time;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6149,6 +6175,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		return -1;
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
+	if (cluster_sd)
+		cpumask_andnot(cpus, cpus, sched_domain_span(cluster_sd));
 
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
@@ -6337,6 +6366,23 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		}
 	}
 
+	if (sched_cluster_active()) {
+		struct sched_domain *cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
+		if (cluster_sd) {
+			i = select_idle_cluster(p, cluster_sd, has_idle_core, target);
+			if ((unsigned)i < nr_cpumask_bits)
+				return i;
+			/*
+			 * if prev and target are not in same LLC, give other cpus who have
+			 * same LLC with target one chance as they are closer than target
+			 * though they are not the closest; otherwise, no need to scan LLC;
+			 * for smt, we always select idle core in the whole LLC
+			 */
+			if (cpus_share_cache(prev, target) && !has_idle_core)
+				return target;
+		}
+	}
+
 	i = select_idle_cpu(p, sd, has_idle_core, target);
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
In wake_affine_idle() and select_idle_sibling(), use cpus_share_cluster()
rather than cpus_share_cache() to determine whether the waker and wakee are
in the same cache domain. This patch mainly benefits lightly loaded systems.
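As a condensed sketch of the resulting wake_affine_idle() logic (simplified
from the diff below, not a literal copy of the kernel function):

static int wake_affine_idle_sketch(int this_cpu, int prev_cpu, int sync)
{
	/*
	 * The affinity test tightens from "same LLC" to "same cluster":
	 * an idle waker CPU only competes with prev_cpu for the wakee
	 * when the two CPUs are cluster-affine.
	 */
	if (available_idle_cpu(this_cpu) && cpus_share_cluster(this_cpu, prev_cpu))
		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;

	if (sync && cpu_rq(this_cpu)->nr_running == 1)
		return this_cpu;

	return nr_cpumask_bits;	/* no decision here */
}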
We are seeing a huge improvement while load is relatively light:
(one NUMA node, 24 cores, 6 clusters)

tbench4
                          tbench                          tbench
              1numa-w/-cluster-spread-only   1numa-w-cluster-spread+scancluster+cpus_share_cluster
Hmean     1       297.22 (   0.00%)       331.31 *  11.47%*
Hmean     2       597.33 (   0.00%)       661.35 *  10.72%*
Hmean     4      1211.53 (   0.00%)      1317.34 *   8.73%*
Hmean     8      2465.05 (   0.00%)      2612.18 *   5.97%*
Hmean     16     3194.06 (   0.00%)      4089.96 *  28.05%*
Hmean     32     5984.82 (   0.00%)      6374.53 *   6.51%*
Hmean     64     5017.10 (   0.00%)      5255.47 *   4.75%*
Hmean     96     4705.88 (   0.00%)      4860.40 *   3.28%*
With this patch, tbench4 performs better than without the cluster scheduler
under both light and heavy load:

tbench4
                          tbench                          tbench
                   1numa-w/o-cluster   1numa-w-cluster-spread+scancluster+cpus_share_cluster
Hmean     1       314.11 (   0.00%)       331.31 *   5.47%*
Hmean     2       612.83 (   0.00%)       661.35 *   7.92%*
Hmean     4      1232.90 (   0.00%)      1317.34 *   6.85%*
Hmean     8      2470.26 (   0.00%)      2612.18 *   5.74%*
Hmean     16     3203.89 (   0.00%)      4089.96 *  27.66%*
Hmean     32     6210.83 (   0.00%)      6374.53 *   2.64%*
Hmean     64     5053.54 (   0.00%)      5255.47 *   4.00%*
Hmean     96     4726.44 (   0.00%)      4860.40 *   2.83%*
[ Todo: need pgbench, hackbench on 1numa, 4numa; need tbench4 on 4numa ]
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 kernel/sched/core.c | 5 ++++-
 kernel/sched/fair.c | 6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 47a4c82..6cfb0fa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3504,7 +3504,10 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 
 bool cpus_share_cluster(int this_cpu, int that_cpu)
 {
-	return per_cpu(sd_cluster_id, this_cpu) == per_cpu(sd_cluster_id, that_cpu);
+	if (sched_cluster_active())
+		return per_cpu(sd_cluster_id, this_cpu) == per_cpu(sd_cluster_id, that_cpu);
+
+	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdc2cf8..41ebbf2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5797,7 +5797,7 @@ static int wake_wide(struct task_struct *p)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+	if (available_idle_cpu(this_cpu) && cpus_share_cluster(this_cpu, prev_cpu))
 		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
@@ -6297,7 +6297,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) &&
+	if (prev != target && cpus_share_cluster(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
 	    asym_fits_capacity(task_util, prev))
 		return prev;
@@ -6320,7 +6320,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	recent_used_cpu = p->recent_used_cpu;
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
+	    cpus_share_cluster(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_capacity(task_util, recent_used_cpu)) {