If we only do spreading as in patchset [1], workloads which are sensitive to
the wake-up path and cache affinity, such as tbench4, see a performance
degradation (one NUMA node, 24 cores, 6 clusters):

tbench4
                           tbench                    tbench
                  1numa-w/o-cluster      1numa-w/-cluster-spread-only
Hmean     1       314.11 (   0.00%)      297.22 *  -5.38%*
Hmean     2       612.83 (   0.00%)      597.33 *  -2.53%*
Hmean     4      1232.90 (   0.00%)     1211.53 *  -1.73%*
Hmean     8      2470.26 (   0.00%)     2465.05 *  -0.21%*
Hmean     16     3203.89 (   0.00%)     3194.06 *  -0.31%*
Hmean     32     6210.83 (   0.00%)     5984.82 *  -3.64%*
Hmean     64     5053.54 (   0.00%)     5017.10 *  -0.72%*
Hmean     96     4726.44 (   0.00%)     4705.88 *  -0.44%*
This patch changes the wake-up path to scan the cluster before scanning the
LLC when the waker and wakee are in different LLCs; when they are in the same
LLC, only the cluster is scanned unless the LLC's has_idle_core is true.
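Roughly, the intended scan order can be summarised by the minimal standalone
sketch below. This is not the kernel code itself; pick_wakeup_cpu(),
scan_cluster() and scan_llc() are hypothetical stand-ins for
select_idle_sibling(), select_idle_cluster() and select_idle_cpu() in the
patch that follows:

#include <stdbool.h>

/*
 * Minimal sketch of the intended wake-up scan order; scan_cluster() and
 * scan_llc() are hypothetical callbacks returning an idle CPU or -1.
 */
static int pick_wakeup_cpu(int target, bool same_llc, bool has_idle_core,
			   int (*scan_cluster)(int target),
			   int (*scan_llc)(int target))
{
	int cpu;

	/* Always try the target's cluster first: it is the closest level. */
	cpu = scan_cluster(target);
	if (cpu >= 0)
		return cpu;

	/*
	 * Waker and wakee in the same LLC and no idle core: stop here,
	 * the rest of the LLC is no closer than target itself.
	 */
	if (same_llc && !has_idle_core)
		return target;

	/* Otherwise fall back to scanning the rest of the LLC. */
	cpu = scan_llc(target);
	return cpu >= 0 ? cpu : target;
}

In the actual patch this logic lives in select_idle_sibling() and
select_idle_cpu(), and the cluster CPUs that have already been scanned are
removed from the LLC scan so they are not visited twice.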
With this change to the scanning path, we see an obvious improvement on
tbench4 when the system load is relatively heavy (one NUMA node, 24 cores,
6 clusters):

tbench4
                            tbench                          tbench
          1numa-w/-cluster-spread-only    1numa-w/-cluster-spread+scancluster
Hmean     1       297.22 (   0.00%)       297.40 (   0.06%)
Hmean     2       597.33 (   0.00%)       597.32 (  -0.00%)
Hmean     4      1211.53 (   0.00%)      1225.72 *   1.17%*
Hmean     8      2465.05 (   0.00%)      2508.96 *   1.78%*
Hmean     16     3194.06 (   0.00%)      4029.90 *  26.17%*
Hmean     32     5984.82 (   0.00%)      6376.05 *   6.54%*
Hmean     64     5017.10 (   0.00%)      5250.62 *   4.65%*
Hmean     96     4705.88 (   0.00%)      4882.62 *   3.76%*
But when the system load is relatively light, the cluster scheduler with this
patch still performs worse than the system without the cluster scheduler
(one NUMA node, 24 cores, 6 clusters):

tbench4
                           tbench                       tbench
                  1numa-w/o-cluster    1numa-w/-cluster-spread+scancluster
Hmean     1       314.11 (   0.00%)      297.40 *  -5.32%*
Hmean     2       612.83 (   0.00%)      597.32 *  -2.53%*
Hmean     4      1232.90 (   0.00%)     1225.72 *  -0.58%*
Hmean     8      2470.26 (   0.00%)     2508.96 *   1.57%*
Hmean     16     3203.89 (   0.00%)     4029.90 *  25.78%*
Hmean     32     6210.83 (   0.00%)     6376.05 *   2.66%*
Hmean     64     5053.54 (   0.00%)     5250.62 *   3.90%*
Hmean     96     4726.44 (   0.00%)     4882.62 *   3.30%*

[ Hi Tim,
  I don't have a machine with both cluster and SMT to test on. If you have
  one, please feel free to adjust select_idle_cluster and
  select_idle_sibling.
  According to https://en.wikipedia.org/wiki/Tremont_(microarchitecture)#Processors_for_bas...)
  it seems your machine isn't SMT either?
  Thanks
  Barry ]
[1] https://op-lists.linaro.org/pipermail/linaro-open-discussions/2021-June/0001...
[ Todo:
  need pgbench, hackbench on 1numa, 4numa;
  need tbench4 on 4numa ]

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 kernel/sched/fair.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8578cb1..fdc2cf8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6132,6 +6132,32 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 #endif /* CONFIG_SCHED_SMT */
 
 /*
+ * Scan the cluster domain for idle CPUs
+ */
+static int select_idle_cluster(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+	int i, cpu, idle_cpu = -1;
+
+	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+
+	for_each_cpu_wrap(cpu, cpus, target) {
+		if (has_idle_core) {
+			i = select_idle_core(p, cpu, cpus, &idle_cpu);
+			if ((unsigned int)i < nr_cpumask_bits)
+				return i;
+
+		} else {
+			idle_cpu = __select_idle_cpu(cpu, p);
+			if ((unsigned int)idle_cpu < nr_cpumask_bits)
+				break;
+		}
+	}
+
+	return idle_cpu;
+}
+
+/*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
@@ -6141,7 +6167,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	int this = smp_processor_id();
-	struct sched_domain *this_sd;
+	struct sched_domain *this_sd, *cluster_sd;
 	u64 time;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6149,6 +6175,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		return -1;
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
+	if (cluster_sd)
+		cpumask_andnot(cpus, cpus, sched_domain_span(cluster_sd));
 
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
@@ -6337,6 +6366,23 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		}
 	}
 
+	if (sched_cluster_active()) {
+		struct sched_domain *cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
+		if (cluster_sd) {
+			i = select_idle_cluster(p, cluster_sd, has_idle_core, target);
+			if ((unsigned)i < nr_cpumask_bits)
+				return i;
+			/*
+			 * if prev and target are in different LLCs, give other CPUs
+			 * sharing the LLC with target a chance since they are closer
+			 * than target, though not the closest; otherwise no need to
+			 * scan the LLC; for SMT, always pick an idle core in the LLC
+			 */
+			if (cpus_share_cache(prev, target) && !has_idle_core)
+				return target;
+		}
+	}
+
 	i = select_idle_cpu(p, sd, has_idle_core, target);
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;