-----Original Message-----
From: yangyicong
Sent: Friday, November 19, 2021 9:50 PM
To: Barry Song <21cnbao@gmail.com>
Cc: yangyicong <yangyicong@huawei.com>; Tim Chen <tim.c.chen@linux.intel.com>; linaro-open-discussions@op-lists.linaro.org; Song Bao Hua (Barry Song) <song.bao.hua@hisilicon.com>; Zengtao (B) <prime.zeng@hisilicon.com>; Jonathan Cameron <jonathan.cameron@huawei.com>; shenyang (M) <shenyang39@huawei.com>; tangchengchang <tangchengchang@huawei.com>; Linuxarm <linuxarm@huawei.com>
Subject: Re: [PATCH 2/2] sched/fair: Scan from the first cpu of cluster if presents in select_idle_cpu
On 2021/11/19 15:13, Barry Song wrote:
On Fri, Nov 19, 2021 at 2:48 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang <yangyicong@hisilicon.com> wrote:
On 2021/11/11 5:46, Barry Song wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ff69f245b939..852a048a5f8c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
>  static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
>  {
>  	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> -	int i, cpu, idle_cpu = -1, nr = INT_MAX;
> +	int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
> +	struct sched_domain *this_sd, *cluster_sd;
>  	struct rq *this_rq = this_rq();
>  	int this = smp_processor_id();
> -	struct sched_domain *this_sd;
>  	u64 time = 0;
>
>  	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
>  		return -1;
>
>  	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +	cpumask_clear_cpu(target, cpus);

We may want to double check whether it is necessary to clear the target, as the target can become idle again after it has been scanned.

> +	cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));

This line is wrong. It should be:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 852a048..0a946ba 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 	cpumask_clear_cpu(target, cpus);

-	cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
+	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
 	scan_from = cluster_sd ?
 		cpumask_first(sched_domain_span(cluster_sd)) : target + 1;

Testing needs to be done again with this fix.
Don't we also need to reconsider which LLC domain we should take? Currently we take the LLC of this CPU, and as tested by @shenyang, there are cases where target and this CPU are in different NUMA nodes (NUMA node = LLC on 920).
We are scanning the cluster or the LLC of the target CPU on this CPU. We are not taking the LLC of this CPU at all, since sd has already been set to the target's LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
I've changed the patch as below:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f24..4918756 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int i, cpu, idle_cpu = -1, nr = INT_MAX;
+	int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
+	struct sched_domain *this_sd, *cluster_sd;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
-	struct sched_domain *this_sd;
 	u64 time = 0;

 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool

 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

+	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
+	scan_from = cluster_sd ?
+		cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
+
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
 		unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		time = cpu_clock(this);
 	}

-	for_each_cpu_wrap(cpu, cpus, target + 1) {
+	for_each_cpu_wrap(cpu, cpus, scan_from) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
 			if ((unsigned int)i < nr_cpumask_bits)
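The effect of changing the scan start can be sketched with a toy user-space re-implementation of the wrap-around iteration (illustrative only; `scan_wrap` is a hypothetical name, and the mask is a plain bitmask standing in for a cpumask):

```c
#include <assert.h>

/* Toy analogue of for_each_cpu_wrap(): visit every set bit of "mask"
 * (bit i == CPU i allowed), starting at "start" and wrapping around.
 * Records the visit order into "order" and returns the count. */
static int scan_wrap(unsigned int mask, int nbits, int start, int *order)
{
	int n = 0;

	for (int k = 0; k < nbits; k++) {
		int cpu = (start + k) % nbits;

		if (mask & (1u << cpu))
			order[n++] = cpu;
	}
	return n;
}

/* With 8 CPUs, target = 5 in cluster 4..7, scan_from = 4: the scan
 * visits 4,5,6,7 (target's cluster siblings) before 0,1,2,3, instead
 * of starting blindly from target + 1. */
```

This is exactly the intent of the patch: CPUs sharing the target's cluster (and so its L3 tag cache on machines like Kunpeng 920) are tried before the rest of the LLC.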
tbench4 on a 4-NUMA machine:

                    Vanilla            ScanCluster
Hmean 1      248.79 (  0.00%)    209.32 * -15.86%*
Hmean 2      451.59 (  0.00%)    458.94 *   1.63%*
Hmean 4     1124.71 (  0.00%)   1136.36 *   1.04%*
Hmean 8     2121.84 (  0.00%)   2253.71 *   6.21%*
Hmean 16    4244.98 (  0.00%)   4561.48 *   7.46%*
Hmean 32    7405.50 (  0.00%)   8322.98 *  12.39%*
Hmean 64    4057.07 (  0.00%)   4434.10 *   9.29%*
Hmean 128   5796.01 (  0.00%)   6005.13 *   3.61%*
Hmean 256   7085.31 (  0.00%)   7904.76 *  11.57%*
Hmean 512   7531.25 (  0.00%)   7363.00 *  -2.23%*
Hi Yicong,

Will this fix the regression for pgbench, and even further improve tbench and hackbench?
I'll have a try.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4918756..c6ae05f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6265,7 +6265,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
+	int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX, cluster_weight;
 	struct sched_domain *this_sd, *cluster_sd;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
@@ -6279,6 +6279,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
 	scan_from = cluster_sd ?
 		cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
+	cluster_weight = cluster_sd ? cpumask_weight(sched_domain_span(cluster_sd)) : 0;

 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
@@ -6305,6 +6306,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		else
 			nr = 4;

+		/* compensation for CPUs outside the cluster */
+		nr += cluster_weight/2;
Any particular reason for the division by 2, or is it just another heuristic number?
If target is the first CPU of the cluster, we actually haven't changed the behavior. If target is the last CPU of the cluster, CPUs outside the cluster lose scan opportunities equal to the size of the whole cluster. So on average, CPUs outside the cluster lose half a cluster's worth of scan slots.

Not quite sure whether this is going to work. For example, when nr=4 and the system is quite busy, we probably don't need to do any compensation.
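The averaging argument above can be checked with a toy calculation (user-space sketch only; `avg_lost_slots_x2` is a hypothetical name). If target sits at 0-based offset j inside a cluster of W CPUs, scanning from the first CPU of the cluster pushes roughly j extra cluster CPUs in front of the CPUs outside the cluster; averaged over all offsets that is (W-1)/2, which the patch rounds to W/2:

```c
#include <assert.h>

/* Returns twice the average number of scan slots lost by CPUs
 * outside the cluster, assuming the loss is j when target is at
 * 0-based offset j in a cluster of w CPUs (doubled to stay integral).
 * average = (0 + 1 + ... + (w-1)) / w = (w-1)/2. */
static int avg_lost_slots_x2(int w)
{
	int sum = 0;

	for (int j = 0; j < w; j++)
		sum += j;	/* slots lost when target is at offset j */
	return 2 * sum / w;
}

/* For a 4-CPU cluster: average loss is 1.5 slots, so the x2 value is
 * 3, against the patch's compensation of cluster_weight/2 = 2. */
```

So the W/2 compensation slightly over-corrects relative to the exact (W-1)/2 average, which matches the "just another heuristic" concern raised above.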
 		time = cpu_clock(this);
 	}
Thanks
Barry