From: Barry Song <song.bao.hua@hisilicon.com>
For platforms with clusters, such as Kunpeng 920, tasks in the same cluster share the L3 cache tag and therefore see lower latency when synchronizing and accessing shared resources. Based on this, waking a task on a CPU in the same cluster as the waker makes the migration cost smaller. This patch tries to find a wake CPU by scanning the cluster first, before scanning the rest of the LLC.
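For illustration only (not part of the kernel change), the scan order this patch introduces can be modelled in plain C as below. This is a standalone sketch: the 16-bit masks and helper names are hypothetical stand-ins for the kernel's cpumasks and sched_domain_span().

  #include <stdint.h>
  #include <stdio.h>

  /* Bit i set means CPU i is a member of the set (or is idle). */
  typedef uint16_t mask_t;

  /* Return the lowest set bit as a CPU number, or -1 if the set is empty. */
  static int first_cpu(mask_t m)
  {
          int cpu;

          for (cpu = 0; cpu < 16; cpu++)
                  if (m & (1u << cpu))
                          return cpu;
          return -1;
  }

  /* Cluster-first scan: mirrors the order introduced by the patch. */
  static int pick_idle_cpu(mask_t cluster, mask_t llc, mask_t idle)
  {
          int cpu;

          /* 1. Scan the wakee's cluster first: cheapest migration. */
          cpu = first_cpu(cluster & idle);
          if (cpu >= 0)
                  return cpu;

          /*
           * 2. Fall back to the rest of the LLC, excluding the cluster
           *    CPUs that have already been scanned.
           */
          return first_cpu((llc & ~cluster) & idle);
  }

  int main(void)
  {
          mask_t cluster = 0x00f0;  /* CPUs 4-7 form the target's cluster */
          mask_t llc     = 0xffff;  /* CPUs 0-15 share the LLC */
          mask_t idle    = 0x0120;  /* CPUs 5 and 8 are currently idle */

          /* Prints 5: idle and inside the cluster, so preferred over CPU 8. */
          printf("picked CPU %d\n", pick_idle_cpu(cluster, llc, idle));
          return 0;
  }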
Benchmark tests have been done on a 2-socket, 4-NUMA Kunpeng 920 with 8 clusters in each NUMA node. The results from tbench and hackbench are rather positive.
tbench4 (Hmean is throughput in MB/sec; higher is better)
                  5.16-rc1-vanilla       5.16-rc1+patch
Hmean     1        341.78 (   0.00%)      350.10 *   2.43%*
Hmean     2        684.31 (   0.00%)      700.25 *   2.33%*
Hmean     4       1350.03 (   0.00%)     1374.33 *   1.80%*
Hmean     8       2563.33 (   0.00%)     2615.74 *   2.04%*
Hmean     16      4976.31 (   0.00%)     4911.05 *  -1.31%*
Hmean     32      8446.80 (   0.00%)     9076.71 *   7.46%*
Hmean     64      4938.98 (   0.00%)     5890.29 *  19.26%*
Hmean     128     7422.75 (   0.00%)     8941.65 *  20.46%*
Hmean     256     7503.72 (   0.00%)     7609.30 *   1.41%*
Hmean     512     6526.50 (   0.00%)     7616.90 *  16.71%*
hackbench-process-pipes (Amean is runtime in seconds; lower is better, so positive percentages are improvements)
                  5.16-rc1-vanilla       5.16-rc1+patch
Amean     1        0.7233 (   0.00%)      0.6048 *  16.38%*
Amean     4        1.6168 (   0.00%)      0.9831 *  39.19%*
Amean     7        1.7604 (   0.00%)      1.3456 *  23.56%*
Amean     12       2.1637 (   0.00%)      2.0515 *   5.19%*
Amean     21       3.7302 (   0.00%)      3.4755 *   6.83%*
Amean     30       6.8281 (   0.00%)      5.4964 *  19.50%*
Amean     48      11.5442 (   0.00%)      9.2672 *  19.72%*
Amean     79      14.1319 (   0.00%)     12.1617 *  13.94%*
Amean     110     17.2689 (   0.00%)     15.0081 *  13.09%*
Amean     141     20.2057 (   0.00%)     18.4041 *   8.92%*
Amean     172     25.2087 (   0.00%)     21.2069 *  15.87%*
Amean     203     28.4038 (   0.00%)     24.8319 *  12.58%*
Amean     234     32.4690 (   0.00%)     28.2500 *  12.99%*
Amean     256     33.1803 (   0.00%)     30.0114 *   9.55%*
Tested-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
---
 kernel/sched/fair.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e476f6d9435..f8b094738c03 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6230,6 +6230,34 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 
 #endif /* CONFIG_SCHED_SMT */
 
+#ifdef CONFIG_SCHED_CLUSTER
+static inline int scan_cluster(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target, int *idle_cpu)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+	int i, cpu;
+
+	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	cpumask_clear_cpu(target, cpus);
+
+	for_each_cpu_wrap(cpu, cpus, target + 1) {
+		if (has_idle_core)
+			i = select_idle_core(p, cpu, cpus, idle_cpu);
+		else
+			i = __select_idle_cpu(cpu, p);
+
+		if ((unsigned int)i < nr_cpumask_bits)
+			return i;
+	}
+
+	return -1;
+}
+#else
+static inline int scan_cluster(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target, int *idle_cpu)
+{
+	return -1;
+}
+#endif
+
 /*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -6241,14 +6269,25 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
-	struct sched_domain *this_sd;
+	struct sched_domain *this_sd, *cluster_sd;
 	u64 time = 0;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
 		return -1;
 
+	/* scan cluster before scanning LLC */
+	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
+	if (cluster_sd) {
+		i = scan_cluster(p, cluster_sd, has_idle_core, target, &idle_cpu);
+		if ((unsigned int)i < nr_cpumask_bits)
+			return i;
+	}
+
+	/* scan LLC excluding cluster */
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	if (cluster_sd)
+		cpumask_andnot(cpus, cpus, sched_domain_span(cluster_sd));
 
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
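A note on the idiom, not part of the patch: the `(unsigned int)i < nr_cpumask_bits` tests above are the scheduler's usual single-comparison validity check. Casting -1 to unsigned yields UINT_MAX, which fails the bound check, so "no CPU found" and out-of-range values are rejected with one compare. A minimal standalone demonstration, with NR_BITS as a hypothetical stand-in for nr_cpumask_bits:

  #include <stdio.h>

  #define NR_BITS 16  /* hypothetical stand-in for nr_cpumask_bits */

  /* One comparison covers both i == -1 and i out of range. */
  static int is_valid_cpu(int i)
  {
          return (unsigned int)i < NR_BITS;
  }

  int main(void)
  {
          printf("%d\n", is_valid_cpu(5));   /* 1: valid CPU number */
          printf("%d\n", is_valid_cpu(-1));  /* 0: -1 wraps to UINT_MAX */
          printf("%d\n", is_valid_cpu(16));  /* 0: beyond the bitmap */
          return 0;
  }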