On Fri, Nov 19, 2021 at 2:48 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/11 5:46, Barry Song wrote:
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>> index ff69f245b939..852a048a5f8c 100644 >>>> --- a/kernel/sched/fair.c >>>> +++ b/kernel/sched/fair.c >>>> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd >>>> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) >>>> { >>>> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); >>>> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >>>> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >>>> + struct sched_domain *this_sd, *cluster_sd; >>>> struct rq *this_rq = this_rq(); >>>> int this = smp_processor_id(); >>>> - struct sched_domain *this_sd; >>>> u64 time = 0; >>>> >>>> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >>>> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>>> return -1; >>>> >>>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>>> + cpumask_clear_cpu(target, cpus);
We may want to double-check whether it is necessary to clear the target, as the target can become idle after it has been scanned.
>>>> + >>>> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 852a048..0a946ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); cpumask_clear_cpu(target, cpus);
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and, as tested by @shenyang, there are cases where the target and this CPU are from different NUMA nodes (NUMA = LLC on the 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
I've changed the patch as below: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f24..4918756 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
struct sched_domain *this_sd; u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); }
for_each_cpu_wrap(cpu, cpus, target + 1) {
for_each_cpu_wrap(cpu, cpus, scan_from) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits)
tbench4 on 4NUMA machine:
Vanilla Scancluster Vanilla/Scancluster
Hmean 1 248.79 ( 0.00%) 209.32 * -15.86%* Hmean 2 451.59 ( 0.00%) 458.94 * 1.63%* Hmean 4 1124.71 ( 0.00%) 1136.36 * 1.04%* Hmean 8 2121.84 ( 0.00%) 2253.71 * 6.21%* Hmean 16 4244.98 ( 0.00%) 4561.48 * 7.46%* Hmean 32 7405.50 ( 0.00%) 8322.98 * 12.39%* Hmean 64 4057.07 ( 0.00%) 4434.10 * 9.29%* Hmean 128 5796.01 ( 0.00%) 6005.13 * 3.61%* Hmean 256 7085.31 ( 0.00%) 7904.76 * 11.57%* Hmean 512 7531.25 ( 0.00%) 7363.00 * -2.23%*
Hi Yicong, Will this fix the regression for pgbench, and even further improve tbench and hackbench?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4918756..c6ae05f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,7 +6265,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX, cluster_weight; struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id(); @@ -6279,6 +6279,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1; + cluster_weight = cluster_sd ? cpumask_weight(sched_domain_span(cluster_sd)) : 0;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; @@ -6305,6 +6306,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool else nr = 4;
+ /* compensation for CPUs outside the cluster */ + nr += cluster_weight/2; + time = cpu_clock(this); }
Thanks Barry