In wake_affine_idle() and select_idle_sibling(), use cpus_share_cluster() rather than cpus_share_cache() to determine whether the waker and wakee are in the same cache domain. This mainly benefits lightly loaded systems.
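For readers unfamiliar with the cluster topology, the following standalone sketch (not kernel code; the topology of 6 clusters of 4 CPUs sharing one LLC is assumed from the test machine described below) illustrates why the cluster-level check is stricter than the LLC-level one: two CPUs in the same LLC but in different clusters now fail the affinity test, so the wakee is kept on its previous CPU instead of being pulled next to the waker:

	/* Userspace model of the two checks; llc_id[] and cluster_id[]
	 * stand in for per_cpu(sd_llc_id, cpu) and per_cpu(sd_cluster_id, cpu). */
	#include <stdbool.h>
	#include <stdio.h>

	#define NR_CPUS 24

	static int llc_id[NR_CPUS];
	static int cluster_id[NR_CPUS];

	static bool cpus_share_cache(int this_cpu, int that_cpu)
	{
		return llc_id[this_cpu] == llc_id[that_cpu];
	}

	static bool cpus_share_cluster(int this_cpu, int that_cpu)
	{
		/* The patch falls back to the LLC id when clusters are not
		 * active; in this model clusters are always "active". */
		return cluster_id[this_cpu] == cluster_id[that_cpu];
	}

	int main(void)
	{
		for (int cpu = 0; cpu < NR_CPUS; cpu++) {
			llc_id[cpu] = 0;		/* one shared LLC      */
			cluster_id[cpu] = cpu / 4;	/* 6 clusters of 4 CPUs */
		}

		/* CPUs 0 and 1: same cluster, both checks pass. */
		printf("0,1: cache=%d cluster=%d\n",
		       cpus_share_cache(0, 1), cpus_share_cluster(0, 1));

		/* CPUs 0 and 5: same LLC, different clusters, so only the
		 * old cpus_share_cache() check passes. */
		printf("0,5: cache=%d cluster=%d\n",
		       cpus_share_cache(0, 5), cpus_share_cluster(0, 5));
		return 0;
	}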
We are seeing a huge improvement when load is relatively light (1 NUMA node, 24 cores, 6 clusters):

tbench4                   tbench                         tbench
              1numa-w/-cluster-spread-only    1numa-w-cluster-spread+scancluster+cpus_share_cluster
Hmean     1      297.22 (   0.00%)      331.31 *  11.47%*
Hmean     2      597.33 (   0.00%)      661.35 *  10.72%*
Hmean     4     1211.53 (   0.00%)     1317.34 *   8.73%*
Hmean     8     2465.05 (   0.00%)     2612.18 *   5.97%*
Hmean    16     3194.06 (   0.00%)     4089.96 *  28.05%*
Hmean    32     5984.82 (   0.00%)     6374.53 *   6.51%*
Hmean    64     5017.10 (   0.00%)     5255.47 *   4.75%*
Hmean    96     4705.88 (   0.00%)     4860.40 *   3.28%*
With this patch, tbench4 also beats the w/o-cluster baseline under both light and heavy load:
tbench4                   tbench                         tbench
              1numa-w/o-cluster               1numa-w-cluster-spread+scancluster+cpus_share_cluster
Hmean     1      314.11 (   0.00%)      331.31 *   5.47%*
Hmean     2      612.83 (   0.00%)      661.35 *   7.92%*
Hmean     4     1232.90 (   0.00%)     1317.34 *   6.85%*
Hmean     8     2470.26 (   0.00%)     2612.18 *   5.74%*
Hmean    16     3203.89 (   0.00%)     4089.96 *  27.66%*
Hmean    32     6210.83 (   0.00%)     6374.53 *   2.64%*
Hmean    64     5053.54 (   0.00%)     5255.47 *   4.00%*
Hmean    96     4726.44 (   0.00%)     4860.40 *   2.83%*
[ Todo: need pgbench, hackbench on 1numa, 4numa; need tbench4 on 4numa ]
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
---
 kernel/sched/core.c | 5 ++++-
 kernel/sched/fair.c | 6 +++---
 2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 47a4c82..6cfb0fa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3504,7 +3504,10 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 bool cpus_share_cluster(int this_cpu, int that_cpu)
 {
-	return per_cpu(sd_cluster_id, this_cpu) == per_cpu(sd_cluster_id, that_cpu);
+	if (sched_cluster_active())
+		return per_cpu(sd_cluster_id, this_cpu) == per_cpu(sd_cluster_id, that_cpu);
+
+	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdc2cf8..41ebbf2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5797,7 +5797,7 @@ static int wake_wide(struct task_struct *p)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+	if (available_idle_cpu(this_cpu) && cpus_share_cluster(this_cpu, prev_cpu))
 		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
@@ -6297,7 +6297,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) &&
+	if (prev != target && cpus_share_cluster(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
 	    asym_fits_capacity(task_util, prev))
 		return prev;
@@ -6320,7 +6320,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	recent_used_cpu = p->recent_used_cpu;
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
+	    cpus_share_cluster(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_capacity(task_util, recent_used_cpu)) {