 
            On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/11 5:46, Barry Song wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >> index ff69f245b939..852a048a5f8c 100644 >> --- a/kernel/sched/fair.c >> +++ b/kernel/sched/fair.c >> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd >> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) >> { >> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); >> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >> + struct sched_domain *this_sd, *cluster_sd; >> struct rq *this_rq = this_rq(); >> int this = smp_processor_id(); >> - struct sched_domain *this_sd; >> u64 time = 0; >> >> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >> return -1; >> >> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >> + cpumask_clear_cpu(target, cpus);
May double test if it is necessary to clear the target as target can be idle after it was scanned.
>> + >> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 852a048..0a946ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); cpumask_clear_cpu(target, cpus);
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and as tested by @shenyang, there're cases that target and this cpu from different NUMA(=LLC on 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
6470 sd = rcu_dereference(per_cpu(sd_llc, target)); 6471 if (!sd) 6472 return target; 6473 6474 if (sched_smt_active()) { 6475 has_idle_core = test_idle_cores(target, false); 6476 6477 if (!has_idle_core && cpus_share_cache(prev, target)) { 6478 i = select_idle_smt(p, sd, prev); 6479 if ((unsigned int)i < nr_cpumask_bits) 6480 return i; 6481 } 6482 } 6483 6484 i = select_idle_cpu(p, sd, has_idle_core, target); 6485 if ((unsigned)i < nr_cpumask_bits) 6486 return i; 6487 6488 return target; 6489 }
>> + scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1; >> >> if (sched_feat(SIS_PROP) && !has_idle_core) { >> u64 avg_cost, avg_idle, span_avg; >> @@ -6305,7 +6309,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >> time = cpu_clock(this); >> } >> >> - for_each_cpu_wrap(cpu, cpus, target + 1) { >> + for_each_cpu_wrap(cpu, cpus, scan_from) { >> if (has_idle_core) { >> i = select_idle_core(p, cpu, cpus, &idle_cpu); >> if ((unsigned int)i < nr_cpumask_bits) >> -- >> 2.33.0 >>
Thanks Barry .