This is follow-up work for the cluster scheduler support. Previously we added a cluster level to the scheduler [1] to spread tasks between clusters, which brings more memory bandwidth and reduces cache contention. However, this may hurt workloads that are sensitive to communication latency, since related tasks end up placed across clusters.
This series modifies select_idle_cpu() on the wake-affine path so that a wake-affined task is more likely to be woken on the same cluster as the waker. Latency is reduced because a waker and wakee in the same cluster can benefit from the hot L3 cache tag.
[1] https://lore.kernel.org/lkml/20210924085104.44806-1-21cnbao@gmail.com/
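To make the intent concrete, below is a rough userspace sketch (purely illustrative, not kernel code) assuming one 8-CPU LLC split into two 4-CPU clusters; the real patch additionally drops the target itself from the scan mask:

#include <stdio.h>

#define NR_CPUS 8	/* hypothetical LLC: CPUs 0-7, clusters {0-3} and {4-7} */

/* first CPU of the 4-CPU cluster containing @cpu */
static int cluster_first(int cpu)
{
	return (cpu / 4) * 4;
}

/* mimic for_each_cpu_wrap(): visit every CPU once, starting at @start */
static void print_scan_order(const char *tag, int start)
{
	int i;

	printf("%-24s", tag);
	for (i = 0; i < NR_CPUS; i++)
		printf(" %d", (start + i) % NR_CPUS);
	printf("\n");
}

int main(void)
{
	int target = 5;

	print_scan_order("before (target + 1):", target + 1);
	print_scan_order("after (cluster first):", cluster_first(target));
	return 0;
}

With target = 5 the old order is 6 7 0 1 2 3 4 5 while the new order is 4 5 6 7 0 1 2 3, i.e. the CPUs sharing the target's cluster are examined first.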
Hi Tim and Barry,

This is the modified patch for the packing path of the cluster scheduler. Tests have been done on a Kunpeng 920 2-socket, 4-NUMA, 128-core platform with 8 clusters on each NUMA node. The patches are based on 5.15-rc1.
Barry Song (2):
  sched: Add per_cpu cluster domain info
  sched/fair: Scan from the first cpu of the cluster if present in select_idle_cpu
 include/linux/sched/sd_flags.h |  9 +++++++++
 include/linux/sched/topology.h |  2 +-
 kernel/sched/fair.c            | 10 +++++++---
 kernel/sched/sched.h           |  1 +
 kernel/sched/topology.c        |  5 +++++
 5 files changed, 23 insertions(+), 4 deletions(-)
From: Barry Song song.bao.hua@hisilicon.com
Add per-cpu cluster domain info. This is the preparation for optimization of select_idle_cpu() on platforms with cluster scheduler level.
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
---
 include/linux/sched/sd_flags.h | 9 +++++++++
 include/linux/sched/topology.h | 2 +-
 kernel/sched/sched.h           | 1 +
 kernel/sched/topology.c        | 5 +++++
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 57bde66d95f7..656473a17904 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -109,6 +109,15 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
+/*
+ * Domain members share CPU cluster resources (i.e. llc cache tags or l2)
+ *
+ * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
+ *               the cluster resources (such as llc tags or l2)
+ * NEEDS_GROUPS: Caches are shared between groups.
+ */
+SD_FLAG(SD_SHARE_CLS_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+
 /*
  * Domain members share CPU package resources (i.e. caches)
  *
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 2f9166f6dec8..846fcac1f752 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_SHARE_CLS_RESOURCES | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e117..84028b146d9b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1778,6 +1778,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 7d27559485ea..b745ce856f58 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -657,6 +658,9 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 	int size = 1;
 
+	sd = highest_flag_domain(cpu, SD_SHARE_CLS_RESOURCES);
+	rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd);
+
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
@@ -1504,6 +1508,7 @@ static unsigned long __read_mostly *sched_numa_onlined_nodes;
  */
 #define TOPOLOGY_SD_FLAGS		\
	(SD_SHARE_CPUCAPACITY	|	\
+	 SD_SHARE_CLS_RESOURCES	|	\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA		|	\
	 SD_ASYM_PACKING)
On Thu, Oct 28, 2021 at 9:18 PM Yicong Yang yangyicong@hisilicon.com wrote:
From: Barry Song song.bao.hua@hisilicon.com
Add per-cpu cluster domain info. This is the preparation for optimization of select_idle_cpu() on platforms with cluster scheduler level.
[...]
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e117..84028b146d9b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1778,6 +1778,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
if we don't enable SD_CLUSTER, we won't have this domain? so we don't need this per-cpu variable?
[...]
+	sd = highest_flag_domain(cpu, SD_SHARE_CLS_RESOURCES);
+	rcu_assign_pointer(per_cpu(sd_cluster, cpu), sd);
here too.
[...]
Thanks barry
On 2021/11/4 18:17, Barry Song wrote:
On Thu, Oct 28, 2021 at 9:18 PM Yicong Yang yangyicong@hisilicon.com wrote:
From: Barry Song song.bao.hua@hisilicon.com
Add per-cpu cluster domain info. This is the preparation for optimization of select_idle_cpu() on platforms with cluster scheduler level.
[...]
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3d3e5793e117..84028b146d9b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1778,6 +1778,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_cluster);
if we don't enable SD_CLUSTER, we won't have this domain? so we don't need this per-cpu variable?
Maybe unnecessary? I see sd_numa below is not protected by the macro like #if CONFIG_NUMA.
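For what it's worth, a minimal userspace sketch (purely illustrative, not kernel code) of the behaviour in question: when no domain level carries SD_SHARE_CLS_RESOURCES, the lookup yields NULL, so the unconditional per-cpu pointer simply stays NULL and any reader falls back to the old behaviour:

#include <stdio.h>
#include <stddef.h>

struct sd_stub {
	unsigned int flags;
	int first_cpu;
};

#define STUB_SHARE_CLS_RESOURCES 0x1	/* stand-in for SD_SHARE_CLS_RESOURCES */

/* stand-in for highest_flag_domain(): NULL when no level carries the flag */
static struct sd_stub *find_domain_with_flag(struct sd_stub *levels, int nr,
					     unsigned int flag)
{
	int i;

	for (i = nr - 1; i >= 0; i--)
		if (levels[i].flags & flag)
			return &levels[i];
	return NULL;
}

int main(void)
{
	/* a topology without any cluster level, e.g. CONFIG_SCHED_CLUSTER=n */
	struct sd_stub levels[] = { { .flags = 0, .first_cpu = 0 } };
	struct sd_stub *cluster_sd =
		find_domain_with_flag(levels, 1, STUB_SHARE_CLS_RESOURCES);
	int target = 6;
	/* same shape as patch 2: fall back to target + 1 when there is no cluster */
	int scan_from = cluster_sd ? cluster_sd->first_cpu : target + 1;

	printf("cluster_sd=%p scan_from=%d\n", (void *)cluster_sd, scan_from);
	return 0;
}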
[...]
From: Barry Song song.bao.hua@hisilicon.com
For platforms with clusters, such as Kunpeng 920, tasks in the same cluster share the L3 cache tag and have lower latency when synchronizing and accessing shared resources. Based on this, this patch changes the starting CPU of the scan in select_idle_cpu() from the CPU after the target to the first CPU of the target's cluster. The search is then performed within the cluster first, so we have a better chance of waking the wakee in the same cluster as the waker.

Benchmark tests have been done on a 2-socket, 4-NUMA Kunpeng 920 with 8 clusters in each NUMA node, both across the whole machine and on NUMA node 0 only. Improvements are observed in most cases compared to 5.15-rc1 with the cluster scheduler level [1].
hackbench-process-pipes
                  5.15-rc1+cluster        5.15-rc1+cluster+patch
Amean     1      0.6136 (   0.00%)       0.5988 (   2.41%)
Amean     4      0.8380 (   0.00%)       0.8904 *  -6.25%*
Amean     7      1.1661 (   0.00%)       1.1017 *   5.52%*
Amean     12     1.4670 (   0.00%)       1.5994 *  -9.03%*
Amean     21     2.8909 (   0.00%)       2.8640 (   0.93%)
Amean     30     4.3943 (   0.00%)       4.2052 (   4.30%)
Amean     48     6.6870 (   0.00%)       6.4079 (   4.17%)
Amean     79    10.4796 (   0.00%)       9.5507 *   8.86%*
Amean     110   14.5310 (   0.00%)      12.2114 *  15.96%*
Amean     141   16.4772 (   0.00%)      14.1517 *  14.11%*
Amean     172   20.0868 (   0.00%)      15.9852 *  20.42%*
Amean     203   22.9282 (   0.00%)      18.4574 *  19.50%*
Amean     234   25.8139 (   0.00%)      20.4725 *  20.69%*
Amean     256   27.6834 (   0.00%)      22.9076 *  17.25%*

tbench4
                  5.15-rc1+cluster        5.15-rc1+cluster+patch
Hmean     1      338.50 (   0.00%)       345.47 *   2.06%*
Hmean     2      672.20 (   0.00%)       695.10 *   3.41%*
Hmean     4     1329.03 (   0.00%)      1357.40 *   2.14%*
Hmean     8     2513.25 (   0.00%)      2419.88 *  -3.71%*
Hmean     16    4957.39 (   0.00%)      4882.04 *  -1.52%*
Hmean     32    8737.07 (   0.00%)      8649.97 *  -1.00%*
Hmean     64    4929.31 (   0.00%)      6570.13 *  33.29%*
Hmean     128   5052.75 (   0.00%)      8157.96 *  61.46%*
Hmean     256   6971.70 (   0.00%)      7648.01 *   9.70%*
Hmean     512   7427.32 (   0.00%)      7450.68 *   0.31%*

tbench4 NUMA 0
                  5.15-rc1+cluster        5.15-rc1+cluster+patch
Hmean     1      318.98 (   0.00%)       322.53 *   1.11%*
Hmean     2      640.50 (   0.00%)       641.89 *   0.22%*
Hmean     4     1277.57 (   0.00%)      1292.54 *   1.17%*
Hmean     8     2584.55 (   0.00%)      2622.64 *   1.47%*
Hmean     16    5245.05 (   0.00%)      5440.75 *   3.73%*
Hmean     32    3231.60 (   0.00%)      3991.83 *  23.52%*
Hmean     64    7361.28 (   0.00%)      7356.56 (  -0.06%)
Hmean     128   6240.28 (   0.00%)      6293.78 *   0.86%*

hackbench-process-pipes NUMA 0
                  5.15-rc1+cluster        5.15-rc1+cluster+patch
Amean     1      0.5196 (   0.00%)       0.5121 (   1.44%)
Amean     4      1.0946 (   0.00%)       1.3234 * -20.90%*
Amean     7      1.9368 (   0.00%)       2.4304 * -25.49%*
Amean     12     3.4168 (   0.00%)       3.6422 *  -6.60%*
Amean     21     6.1119 (   0.00%)       5.5032 *   9.96%*
Amean     30     7.8980 (   0.00%)       7.5433 *   4.49%*
Amean     48    11.2969 (   0.00%)      10.6889 *   5.38%*
Amean     79    17.3220 (   0.00%)      15.2553 *  11.93%*
Amean     110   22.9893 (   0.00%)      19.8521 *  13.65%*
Amean     141   28.5319 (   0.00%)      24.9064 *  12.71%*
Amean     172   34.1731 (   0.00%)      30.8424 *   9.75%*
Amean     203   39.9368 (   0.00%)      35.4607 *  11.21%*
Amean     234   45.6207 (   0.00%)      40.4969 *  11.23%*
Amean     256   50.0725 (   0.00%)      45.0295 *  10.07%*
[1] https://lore.kernel.org/lkml/20210924085104.44806-1-21cnbao@gmail.com/
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
---
 kernel/sched/fair.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f245b939..852a048a5f8c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int i, cpu, idle_cpu = -1, nr = INT_MAX;
+	int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
+	struct sched_domain *this_sd, *cluster_sd;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
-	struct sched_domain *this_sd;
 	u64 time = 0;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		return -1;
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	cpumask_clear_cpu(target, cpus);
+
+	cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
+	scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
 
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
@@ -6305,7 +6309,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		time = cpu_clock(this);
 	}
 
-	for_each_cpu_wrap(cpu, cpus, target + 1) {
+	for_each_cpu_wrap(cpu, cpus, scan_from) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
 			if ((unsigned int)i < nr_cpumask_bits)
On Thu, Oct 28, 2021 at 9:18 PM Yicong Yang yangyicong@hisilicon.com wrote:
[...]
the patchset is causing a kernel panic during kexec reboot:
[ 1254.167993] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000120
[ 1254.176771] Mem abort info:
[ 1254.179551]   ESR = 0x96000004
[ 1254.182596]   EC = 0x25: DABT (current EL), IL = 32 bits
[ 1254.187899]   SET = 0, FnV = 0
[ 1254.190944]   EA = 0, S1PTW = 0
[ 1254.194076]   FSC = 0x04: level 0 translation fault
[ 1254.198944] Data abort info:
[ 1254.201815]   ISV = 0, ISS = 0x00000004
[ 1254.205643]   CM = 0, WnR = 0
[ 1254.208604] user pgtable: 4k pages16] Internal error: Oops: 96000004 [#1] PREEMPT SMP
[ 1254.227375] Modules linked in:
[ 1254.230416] CPU: 0 PID: 786 Comm: kworker/1:2 Not tainted 5.15.0-rc1-00005-g4c1b4a4d90b6-dirty #302
[ 1254.239447] Hardware name: Huawei XA320 V2 /BC82HPNBB, BIOS 0.86 07/19/2019
[ 1254.246393] Workqueue: events cpuset_hotplug_workfn
[ 1254.251263] pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 1254.258211] pc : __bitmap_weight+0x30/0x90
[ 1254.262297] lr : cpu_attach_domain+0x1ec/0x838
[ 1254.266729] sp : ffff8000238fba10
[ 1254.270029] x29: ffff8000238fba10 x28: ffff204000059f00 x27: 0000000000000000
[ 1254.277151] x26: ffff800010e3a238 x25: 0000000000000001 x24: ffff8000117858f0
[ 1254.284274] x23: 0000000000000100 x22: 0000000000000004 x21: 0000000000000120
[ 1254.291395] x20: 0000000000000000 x19: 0000000000000000 x18: 0000000000000001
[ 1254.298517] x17: 0000000000000000 x16: 00000000000006d4 x15: 00000000000006d1
[ 1254.305639] x14: 0000000000000002 x13: 0000000000000000 x12: 0000000000000000
[ 1254.312760] x11: 00000000000000c0 x10: 0000000000000a80 x9 : 0000000000000001
[ 1254.319882] x8 : ffff002080410000 x7 : 0000000000000000 x6 : 0000000000000000
[ 1254.327004] x5 : ffff800011f60b00 x4 : 00000000002dc6c0 x3 : ffff803f6e3fd000
[ 1254.334126] x2 : 0000000000000000 x1 : 0000000000000100 x0 : 0000000000000120
[ 1254.341247] Call trace:
[ 1254.343680]  __bitmap_weight+0x30/0x90
[ 1254.347416]  cpu_attach_domain+0x1ec/0x838
[ 1254.351499]  partition_sched_domains_locked+0x12c/0x908
[ 1254.356711]  rebuild_sched_domains_locked+0x384/0x800
[ 1254.361749]  rebuild_sched_domains+0x24/0x40
[ 1254.366006]  cpuset_hotplug_workfn+0x34c/0x548
[ 1254.370437]  process_one_work+0x1bc/0x338
[ 1254.374433]  worker_thread+0x48/0x418
[ 1254.378081]  kthread+0x14c/0x158
[ 1254.381297]  ret_from_fork+0x10/0x20
[ 1254.384861] Code: 2a0103f7 54000300 d2800013 52800014 (f8737aa0)
[ 1254.390940] ---[ end trace 179fc74a465f3bec ]---
[...]
On Thu, Nov 4, 2021 at 11:39 PM Barry Song 21cnbao@gmail.com wrote:
On Thu, Oct 28, 2021 at 9:18 PM Yicong Yang yangyicong@hisilicon.com wrote:
[...]
the patchset is causing a kernel panic during kexec reboot:
[...]
Sorry, please ignore the noise; it was caused by my local debug code.
one benchmark result:
running sysbench on numa0-1(cpu0-cpu63), and running mysqld on numa2-3(cpu64-127)
sysbench command as below:
numactl -C 0-63 sysbench --db-driver=mysql --mysql-user=sbtest_user \
	--mysql_password=password --mysql-db=sbtest --mysql-host=127.0.0.1 \
	--mysql-port=3306 --point-selects=10 --simple-ranges=1 --sum-ranges=1 \
	--order-ranges=1 --distinct-ranges=1 --delete_inserts=1 --index-updates=1 \
	--non-index-updates=1 --delete-inserts=1 --range-size=100 --time=600 \
	--events=0 --report-interval=60 --tables=64 --table-size=2000000 \
	--threads=64 /usr/share/sysbench/oltp_write_only.lua run

          w/o patchset      w/ patchset
tps       53325.97          52331.69  (-1.86%)
qps       319955.80         313990.12 (-1.86%)
It seems the patchset brings some regression for this particular case. It will need more thought to figure out a better approach.
[...]
Thanks barry
On 2021/11/4 20:12, Barry Song wrote:
On Thu, Nov 4, 2021 at 11:39 PM Barry Song 21cnbao@gmail.com wrote:
On Thu, Oct 28, 2021 at 9:18 PM Yicong Yang yangyicong@hisilicon.com wrote:
[...]
          w/o patchset      w/ patchset
tps       53325.97          52331.69  (-1.86%)
qps       319955.80         313990.12 (-1.86%)
It seems the patchset brings some regression for this particular case. It will need more thought to figure out a better approach.
I set up a MySQL environment on my server and, interestingly, I got a different result.

Since my SSD is located on NUMA 0, I bound mysqld to cpu0-63 and sysbench to cpu64-127.
I know little about mysql so I run a very basic sysbench command like below:
numactl -C 64-127 /mnt/sde/sysbench-1.0.20/INSTALL/bin/sysbench \
	/mnt/sde/sysbench-1.0.20/INSTALL/share/sysbench/oltp_read_write.lua \
	--mysql-host=localhost \
	--mysql-port=3306 \
	--mysql-user=root \
	--mysql-db=test \
	--db-driver=mysql \
	--report-interval=10 \
	--tables=12 \
	--table-size=1000000 \
	--threads=64 \
	--time=120 \
	--events=0 \
	run

          w/o patchset      w/ patchset
tps       20073.61          21510.50  (+7.16%)
qps       401472.28         430209.96 (+7.16%)
avg lat   3.19              2.97      (+6.90%)
The tables and table size differ, so maybe in this MySQL case we can get an improvement.
[...]
Thanks barry .
On Fri, Nov 5, 2021 at 2:14 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/4 20:12, Barry Song wrote:
On Thu, Nov 4, 2021 at 11:39 PM Barry Song 21cnbao@gmail.com wrote:
On Thu, Oct 28, 2021 at 9:18 PM Yicong Yang yangyicong@hisilicon.com wrote:
[...]
          w/o patchset      w/ patchset
tps       20073.61          21510.50  (+7.16%)
qps       401472.28         430209.96 (+7.16%)
avg lat   3.19              2.97      (+6.90%)
with the same tables(12) and table_size(1000000) as below:
numactl -C 0-63 sysbench --db-driver=mysql --mysql-user=sbtest_user \
	--mysql_password=password --mysql-db=sbtest --mysql-host=127.0.0.1 \
	--mysql-port=3306 --point-selects=10 --simple-ranges=1 --sum-ranges=1 \
	--order-ranges=1 --distinct-ranges=1 --index-updates=1 --non-index-updates=1 \
	--delete-inserts=1 --range-size=100 --time=600 --events=0 --report-interval=60 \
	--tables=12 --table-size=1000000 --threads=64 \
	/usr/share/sysbench/oltp_read_write.lua run
If I run the same test by warming up the database for 1 minute with a 12-second report interval, then run oltp_read_write.lua twice for 10 minutes each with a 60-second report interval and collect the result of the second 10-minute run, I get something like:
          w/o patchset      w/ patchset
tps       17929.95          18074.94 (+0.8%)
qps       358599.07         361498.71
The tables and table size differ, so maybe in this MySQL case we can get an improvement.
[...]
Thanks barry
[...]
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	cpumask_clear_cpu(target, cpus);
We may want to double-check whether it is necessary to clear the target, since the target can become idle after it has been scanned.
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 852a048..0a946ba 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 	cpumask_clear_cpu(target, cpus);
 
-	cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
+	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
 	scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
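To spell out the difference with made-up CPU numbers (a toy userspace sketch, not kernel code): the waker may be running in a different cluster, or even a different LLC, than the target, so deriving scan_from from the current CPU's cluster points the scan at the wrong place:

#include <stdio.h>

#define NR_CPUS 16

int main(void)
{
	int cluster_first_cpu[NR_CPUS];
	int cpu, this_cpu = 1, target = 13;

	/* made-up layout: 4-CPU clusters, first CPU of each is (cpu / 4) * 4 */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		cluster_first_cpu[cpu] = (cpu / 4) * 4;

	/* *this_cpu_ptr(&sd_cluster): the waker's cluster -> scan starts at 0 */
	printf("scan_from via this cpu: %d\n", cluster_first_cpu[this_cpu]);
	/* per_cpu(sd_cluster, target): the target's cluster -> scan starts at 12 */
	printf("scan_from via target:   %d\n", cluster_first_cpu[target]);
	return 0;
}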
[...]
Thanks barry
On 2021/11/11 5:46, Barry Song wrote:
[...]
cpumask_clear_cpu(target, cpus);
We may want to double-check whether it is necessary to clear the target, since the target can become idle after it has been scanned.
These two situations made little difference in my previous tests. And as we discussed, if the nr throttling works we won't scan the whole LLC in most cases.

Well, I'll test both cases with the fix below to see whether there is an obvious difference.
Thanks, Yicong
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 852a048..0a946ba 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 	cpumask_clear_cpu(target, cpus);
 
-	cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
+	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
 	scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
[...]
On 2021/11/11 5:46, Barry Song wrote:
[...]
testing needs to be done again with this fix.
Do we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU, and as tested by @shenyang, there are cases where the target and this CPU are from different NUMA nodes (= different LLCs on Kunpeng 920).
scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1; if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg;
@@ -6305,7 +6309,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); }
for_each_cpu_wrap(cpu, cpus, target + 1) {
for_each_cpu_wrap(cpu, cpus, scan_from) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits)
-- 2.33.0
Thanks, Barry.
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/11 5:46, Barry Song wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index ff69f245b939..852a048a5f8c 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd > static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) > { > struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); > - int i, cpu, idle_cpu = -1, nr = INT_MAX; > + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; > + struct sched_domain *this_sd, *cluster_sd; > struct rq *this_rq = this_rq(); > int this = smp_processor_id(); > - struct sched_domain *this_sd; > u64 time = 0; > > this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); > @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > return -1; > > cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); > + cpumask_clear_cpu(target, cpus);
May double test if it is necessary to clear the target as target can be idle after it was scanned.
> + > + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 852a048..0a946ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); cpumask_clear_cpu(target, cpus);
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and as tested by @shenyang, there're cases that target and this cpu from different NUMA(=LLC on 920).
We are scanning the cluster or the LLC of the target CPU, not of this CPU; sd has already been set to the target's LLC by select_idle_sibling() before select_idle_cpu() is called:
	sd = rcu_dereference(per_cpu(sd_llc, target));
	if (!sd)
		return target;

	if (sched_smt_active()) {
		has_idle_core = test_idle_cores(target, false);

		if (!has_idle_core && cpus_share_cache(prev, target)) {
			i = select_idle_smt(p, sd, prev);
			if ((unsigned int)i < nr_cpumask_bits)
				return i;
		}
	}

	i = select_idle_cpu(p, sd, has_idle_core, target);
	if ((unsigned)i < nr_cpumask_bits)
		return i;

	return target;
}
> + scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1; > > if (sched_feat(SIS_PROP) && !has_idle_core) { > u64 avg_cost, avg_idle, span_avg; > @@ -6305,7 +6309,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool > time = cpu_clock(this); > } > > - for_each_cpu_wrap(cpu, cpus, target + 1) { > + for_each_cpu_wrap(cpu, cpus, scan_from) { > if (has_idle_core) { > i = select_idle_core(p, cpu, cpus, &idle_cpu); > if ((unsigned int)i < nr_cpumask_bits) > -- > 2.33.0 >
Thanks Barry
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/11 5:46, Barry Song wrote:
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >> index ff69f245b939..852a048a5f8c 100644 >> --- a/kernel/sched/fair.c >> +++ b/kernel/sched/fair.c >> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd >> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) >> { >> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); >> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >> + struct sched_domain *this_sd, *cluster_sd; >> struct rq *this_rq = this_rq(); >> int this = smp_processor_id(); >> - struct sched_domain *this_sd; >> u64 time = 0; >> >> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >> return -1; >> >> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >> + cpumask_clear_cpu(target, cpus);
May double test if it is necessary to clear the target as target can be idle after it was scanned.
>> + >> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 852a048..0a946ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); cpumask_clear_cpu(target, cpus);
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and as tested by @shenyang, there're cases that target and this cpu from different NUMA(=LLC on 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
6470 sd = rcu_dereference(per_cpu(sd_llc, target)); 6471 if (!sd) 6472 return target; 6473 6474 if (sched_smt_active()) { 6475 has_idle_core = test_idle_cores(target, false); 6476 6477 if (!has_idle_core && cpus_share_cache(prev, target)) { 6478 i = select_idle_smt(p, sd, prev); 6479 if ((unsigned int)i < nr_cpumask_bits) 6480 return i; 6481 } 6482 } 6483 6484 i = select_idle_cpu(p, sd, has_idle_core, target); 6485 if ((unsigned)i < nr_cpumask_bits) 6486 return i; 6487 6488 return target; 6489 }
>> + scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1; >> >> if (sched_feat(SIS_PROP) && !has_idle_core) { >> u64 avg_cost, avg_idle, span_avg; >> @@ -6305,7 +6309,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >> time = cpu_clock(this); >> } >> >> - for_each_cpu_wrap(cpu, cpus, target + 1) { >> + for_each_cpu_wrap(cpu, cpus, scan_from) { >> if (has_idle_core) { >> i = select_idle_core(p, cpu, cpus, &idle_cpu); >> if ((unsigned int)i < nr_cpumask_bits) >> -- >> 2.33.0 >>
Thanks Barry .
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/11 5:46, Barry Song wrote:
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>> index ff69f245b939..852a048a5f8c 100644 >>> --- a/kernel/sched/fair.c >>> +++ b/kernel/sched/fair.c >>> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd >>> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) >>> { >>> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); >>> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >>> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >>> + struct sched_domain *this_sd, *cluster_sd; >>> struct rq *this_rq = this_rq(); >>> int this = smp_processor_id(); >>> - struct sched_domain *this_sd; >>> u64 time = 0; >>> >>> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >>> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>> return -1; >>> >>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>> + cpumask_clear_cpu(target, cpus);
May double test if it is necessary to clear the target as target can be idle after it was scanned.
>>> + >>> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 852a048..0a946ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); cpumask_clear_cpu(target, cpus);
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and as tested by @shenyang, there're cases that target and this cpu from different NUMA(=LLC on 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
I've changed the patch as below:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f24..4918756 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int i, cpu, idle_cpu = -1, nr = INT_MAX;
+	int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
+	struct sched_domain *this_sd, *cluster_sd;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
-	struct sched_domain *this_sd;
 	u64 time = 0;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
+	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
+	scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
+
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
 		unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		time = cpu_clock(this);
 	}
 
-	for_each_cpu_wrap(cpu, cpus, target + 1) {
+	for_each_cpu_wrap(cpu, cpus, scan_from) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
 			if ((unsigned int)i < nr_cpumask_bits)
tbench4 on a 4-NUMA machine:

                 Vanilla              Scancluster
Hmean     1      248.79 (   0.00%)      209.32 * -15.86%*
Hmean     2      451.59 (   0.00%)      458.94 *   1.63%*
Hmean     4     1124.71 (   0.00%)     1136.36 *   1.04%*
Hmean     8     2121.84 (   0.00%)     2253.71 *   6.21%*
Hmean     16    4244.98 (   0.00%)     4561.48 *   7.46%*
Hmean     32    7405.50 (   0.00%)     8322.98 *  12.39%*
Hmean     64    4057.07 (   0.00%)     4434.10 *   9.29%*
Hmean     128   5796.01 (   0.00%)     6005.13 *   3.61%*
Hmean     256   7085.31 (   0.00%)     7904.76 *  11.57%*
Hmean     512   7531.25 (   0.00%)     7363.00 *  -2.23%*
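To illustrate what the patch changes when cluster CPU IDs are contiguous (an editorial example, not taken from the mail): with a 4-CPU cluster spanning CPUs 0-3 and target = 2, the vanilla code starts the wrap at target + 1 = 3 and then immediately leaves the cluster, while the patched code starts at cpumask_first() of the cluster span, i.e. CPU 0, so CPUs 0-3 are all tried before any CPU outside the cluster. The discussion further below covers what happens when the cluster's CPU IDs are not contiguous.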
6470 sd = rcu_dereference(per_cpu(sd_llc, target)); 6471 if (!sd) 6472 return target; 6473 6474 if (sched_smt_active()) { 6475 has_idle_core = test_idle_cores(target, false); 6476 6477 if (!has_idle_core && cpus_share_cache(prev, target)) { 6478 i = select_idle_smt(p, sd, prev); 6479 if ((unsigned int)i < nr_cpumask_bits) 6480 return i; 6481 } 6482 } 6483 6484 i = select_idle_cpu(p, sd, has_idle_core, target); 6485 if ((unsigned)i < nr_cpumask_bits) 6486 return i; 6487 6488 return target; 6489 }
>>> + scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1; >>> >>> if (sched_feat(SIS_PROP) && !has_idle_core) { >>> u64 avg_cost, avg_idle, span_avg; >>> @@ -6305,7 +6309,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>> time = cpu_clock(this); >>> } >>> >>> - for_each_cpu_wrap(cpu, cpus, target + 1) { >>> + for_each_cpu_wrap(cpu, cpus, scan_from) { >>> if (has_idle_core) { >>> i = select_idle_core(p, cpu, cpus, &idle_cpu); >>> if ((unsigned int)i < nr_cpumask_bits) >>> -- >>> 2.33.0 >>>
Thanks Barry .
On Fri, Nov 19, 2021 at 2:48 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/11 5:46, Barry Song wrote:
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>> index ff69f245b939..852a048a5f8c 100644 >>>> --- a/kernel/sched/fair.c >>>> +++ b/kernel/sched/fair.c >>>> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd >>>> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) >>>> { >>>> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); >>>> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >>>> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >>>> + struct sched_domain *this_sd, *cluster_sd; >>>> struct rq *this_rq = this_rq(); >>>> int this = smp_processor_id(); >>>> - struct sched_domain *this_sd; >>>> u64 time = 0; >>>> >>>> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >>>> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>>> return -1; >>>> >>>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>>> + cpumask_clear_cpu(target, cpus);
May double test if it is necessary to clear the target as target can be idle after it was scanned.
>>>> + >>>> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 852a048..0a946ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); cpumask_clear_cpu(target, cpus);
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and as tested by @shenyang, there're cases that target and this cpu from different NUMA(=LLC on 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
I've changed the patch as below: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f24..4918756 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
struct sched_domain *this_sd; u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); }
for_each_cpu_wrap(cpu, cpus, target + 1) {
for_each_cpu_wrap(cpu, cpus, scan_from) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits)
tbench4 on 4NUMA machine:
Vallin Scancluster Vallina Scancluster/
Hmean 1 248.79 ( 0.00%) 209.32 * -15.86%* Hmean 2 451.59 ( 0.00%) 458.94 * 1.63%* Hmean 4 1124.71 ( 0.00%) 1136.36 * 1.04%* Hmean 8 2121.84 ( 0.00%) 2253.71 * 6.21%* Hmean 16 4244.98 ( 0.00%) 4561.48 * 7.46%* Hmean 32 7405.50 ( 0.00%) 8322.98 * 12.39%* Hmean 64 4057.07 ( 0.00%) 4434.10 * 9.29%* Hmean 128 5796.01 ( 0.00%) 6005.13 * 3.61%* Hmean 256 7085.31 ( 0.00%) 7904.76 * 11.57%* Hmean 512 7531.25 ( 0.00%) 7363.00 * -2.23%*
Hi Yicong,

Will this fix the regression for pgbench, and even further improve tbench and hackbench?

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4918756..c6ae05f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6265,7 +6265,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
+	int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX, cluster_weight;
 	struct sched_domain *this_sd, *cluster_sd;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
@@ -6279,6 +6279,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 
 	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
 	scan_from = cluster_sd ? cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
+	cluster_weight = cluster_sd ? cpumask_weight(sched_domain_span(cluster_sd)) : 0;
 
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
@@ -6305,6 +6306,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		else
 			nr = 4;
 
+		/* compensation for CPUs outside the cluster */
+		nr += cluster_weight/2;
+
 		time = cpu_clock(this);
 	}
Thanks Barry
On Fri, Nov 19, 2021 at 8:13 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 19, 2021 at 2:48 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/11 5:46, Barry Song wrote:
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>> index ff69f245b939..852a048a5f8c 100644 >>>>> --- a/kernel/sched/fair.c >>>>> +++ b/kernel/sched/fair.c >>>>> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd >>>>> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) >>>>> { >>>>> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); >>>>> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >>>>> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >>>>> + struct sched_domain *this_sd, *cluster_sd; >>>>> struct rq *this_rq = this_rq(); >>>>> int this = smp_processor_id(); >>>>> - struct sched_domain *this_sd; >>>>> u64 time = 0; >>>>> >>>>> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >>>>> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>>>> return -1; >>>>> >>>>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>>>> + cpumask_clear_cpu(target, cpus);
May double test if it is necessary to clear the target as target can be idle after it was scanned.
>>>>> + >>>>> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 852a048..0a946ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); cpumask_clear_cpu(target, cpus);
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and as tested by @shenyang, there're cases that target and this cpu from different NUMA(=LLC on 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
I've changed the patch as below: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f24..4918756 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
struct sched_domain *this_sd; u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); }
for_each_cpu_wrap(cpu, cpus, target + 1) {
for_each_cpu_wrap(cpu, cpus, scan_from) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits)
tbench4 on 4NUMA machine:
Vallin Scancluster Vallina Scancluster/
Hmean 1 248.79 ( 0.00%) 209.32 * -15.86%* Hmean 2 451.59 ( 0.00%) 458.94 * 1.63%* Hmean 4 1124.71 ( 0.00%) 1136.36 * 1.04%* Hmean 8 2121.84 ( 0.00%) 2253.71 * 6.21%* Hmean 16 4244.98 ( 0.00%) 4561.48 * 7.46%* Hmean 32 7405.50 ( 0.00%) 8322.98 * 12.39%* Hmean 64 4057.07 ( 0.00%) 4434.10 * 9.29%* Hmean 128 5796.01 ( 0.00%) 6005.13 * 3.61%* Hmean 256 7085.31 ( 0.00%) 7904.76 * 11.57%* Hmean 512 7531.25 ( 0.00%) 7363.00 * -2.23%*
Hi Yicong, Will this fix the regression for pgbench, and even further improve tbench and hackbench?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4918756..c6ae05f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,7 +6265,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX, cluster_weight; struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
@@ -6279,6 +6279,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
cluster_weight = cluster_sd ?
cpumask_weight(sched_domain_span(cluster_sd)) : 0;
This should rather be:

	cluster_weight = cluster_sd ? cluster_sd->span_weight : 0;

as cpumask_weight() might be expensive.
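(For context, not part of the patch: sd->span_weight is just cpumask_weight(sched_domain_span(sd)) cached once when the sched domains are built, so reading the cached field on the wakeup path avoids recounting the mask bits on every call.)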
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg;
@@ -6305,6 +6306,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool else nr = 4;
/* compensation for CPUs outside the cluster */
nr += cluster_weight/2;
time = cpu_clock(this); }
Thanks Barry
On 2021/11/19 15:13, Barry Song wrote:
On Fri, Nov 19, 2021 at 2:48 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/11 5:46, Barry Song wrote:
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>> index ff69f245b939..852a048a5f8c 100644 >>>>> --- a/kernel/sched/fair.c >>>>> +++ b/kernel/sched/fair.c >>>>> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd >>>>> static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) >>>>> { >>>>> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); >>>>> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >>>>> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >>>>> + struct sched_domain *this_sd, *cluster_sd; >>>>> struct rq *this_rq = this_rq(); >>>>> int this = smp_processor_id(); >>>>> - struct sched_domain *this_sd; >>>>> u64 time = 0; >>>>> >>>>> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >>>>> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool >>>>> return -1; >>>>> >>>>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>>>> + cpumask_clear_cpu(target, cpus);
May double test if it is necessary to clear the target as target can be idle after it was scanned.
>>>>> + >>>>> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
This line is wrong. should be:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 852a048..0a946ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); cpumask_clear_cpu(target, cpus);
cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster));
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
testing needs to be done again with this fix.
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and as tested by @shenyang, there're cases that target and this cpu from different NUMA(=LLC on 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
I've changed the patch as below: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f24..4918756 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
struct sched_domain *this_sd; u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); }
for_each_cpu_wrap(cpu, cpus, target + 1) {
for_each_cpu_wrap(cpu, cpus, scan_from) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits)
tbench4 on 4NUMA machine:
Vallin Scancluster Vallina Scancluster/
Hmean 1 248.79 ( 0.00%) 209.32 * -15.86%* Hmean 2 451.59 ( 0.00%) 458.94 * 1.63%* Hmean 4 1124.71 ( 0.00%) 1136.36 * 1.04%* Hmean 8 2121.84 ( 0.00%) 2253.71 * 6.21%* Hmean 16 4244.98 ( 0.00%) 4561.48 * 7.46%* Hmean 32 7405.50 ( 0.00%) 8322.98 * 12.39%* Hmean 64 4057.07 ( 0.00%) 4434.10 * 9.29%* Hmean 128 5796.01 ( 0.00%) 6005.13 * 3.61%* Hmean 256 7085.31 ( 0.00%) 7904.76 * 11.57%* Hmean 512 7531.25 ( 0.00%) 7363.00 * -2.23%*
Hi Yicong, Will this fix the regression for pgbench, and even further improve tbench and hackbench?
I'll have a try.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4918756..c6ae05f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,7 +6265,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX, cluster_weight; struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
@@ -6279,6 +6279,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
cluster_weight = cluster_sd ?
cpumask_weight(sched_domain_span(cluster_sd)) : 0;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg;
@@ -6305,6 +6306,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool else nr = 4;
/* compensation for CPUs outside the cluster */
nr += cluster_weight/2;
Any particular reason for the division by 2, or is it just another heuristic number?
time = cpu_clock(this); }
Thanks Barry .
-----Original Message-----
From: yangyicong
Sent: Friday, November 19, 2021 9:50 PM
To: Barry Song 21cnbao@gmail.com
Cc: yangyicong yangyicong@huawei.com; Tim Chen tim.c.chen@linux.intel.com; linaro-open-discussions@op-lists.linaro.org; yangyicong yangyicong@huawei.com; Song Bao Hua (Barry Song) song.bao.hua@hisilicon.com; Zengtao (B) prime.zeng@hisilicon.com; Jonathan Cameron jonathan.cameron@huawei.com; shenyang (M) shenyang39@huawei.com; tangchengchang tangchengchang@huawei.com; Linuxarm linuxarm@huawei.com
Subject: Re: [PATCH 2/2] sched/fair: Scan from the first cpu of cluster if presents in select_idle_cpu
On 2021/11/19 15:13, Barry Song wrote:
On Fri, Nov 19, 2021 at 2:48 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com
wrote:
On 2021/11/11 5:46, Barry Song wrote: >>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>>> index ff69f245b939..852a048a5f8c 100644 >>>>>> --- a/kernel/sched/fair.c >>>>>> +++ b/kernel/sched/fair.c >>>>>> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct
task_struct *p, struct sched_domain *sd
>>>>>> static int select_idle_cpu(struct task_struct *p, struct
sched_domain *sd, bool has_idle_core, int target)
>>>>>> { >>>>>> struct cpumask *cpus =
this_cpu_cpumask_var_ptr(select_idle_mask);
>>>>>> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >>>>>> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >>>>>> + struct sched_domain *this_sd, *cluster_sd; >>>>>> struct rq *this_rq = this_rq(); >>>>>> int this = smp_processor_id(); >>>>>> - struct sched_domain *this_sd; >>>>>> u64 time = 0; >>>>>> >>>>>> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >>>>>> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct
task_struct *p, struct sched_domain *sd, bool
>>>>>> return -1; >>>>>> >>>>>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>>>>> + cpumask_clear_cpu(target, cpus); > > May double test if it is necessary to clear the target as target can > be idle after it was scanned. > >>>>>> + >>>>>> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster)); > > This line is wrong. should be: > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 852a048..0a946ba 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct > *p, struct sched_domain *sd, bool > cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); > cpumask_clear_cpu(target, cpus); > > - cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster)); > + cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); > scan_from = cluster_sd ? > cpumask_first(sched_domain_span(cluster_sd)) : target + 1; > > testing needs to be done again with this fix. >
Since we also need to reconsider which LLC domain we should take?
Currently we take the LLC of this CPU and as tested by @shenyang, there're
cases
that target and this cpu from different NUMA(=LLC on 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
I've changed the patch as below: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f24..4918756 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
struct sched_domain *this_sd; u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); }
for_each_cpu_wrap(cpu, cpus, target + 1) {
for_each_cpu_wrap(cpu, cpus, scan_from) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits)
tbench4 on 4NUMA machine:
Vallin Scancluster Vallina Scancluster/
Hmean 1 248.79 ( 0.00%) 209.32 * -15.86%* Hmean 2 451.59 ( 0.00%) 458.94 * 1.63%* Hmean 4 1124.71 ( 0.00%) 1136.36 * 1.04%* Hmean 8 2121.84 ( 0.00%) 2253.71 * 6.21%* Hmean 16 4244.98 ( 0.00%) 4561.48 * 7.46%* Hmean 32 7405.50 ( 0.00%) 8322.98 * 12.39%* Hmean 64 4057.07 ( 0.00%) 4434.10 * 9.29%* Hmean 128 5796.01 ( 0.00%) 6005.13 * 3.61%* Hmean 256 7085.31 ( 0.00%) 7904.76 * 11.57%* Hmean 512 7531.25 ( 0.00%) 7363.00 * -2.23%*
Hi Yicong, Will this fix the regression for pgbench, and even further improve tbench and hackbench?
I'll have a try.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4918756..c6ae05f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,7 +6265,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX, cluster_weight; struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
@@ -6279,6 +6279,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
cluster_weight = cluster_sd ?
cpumask_weight(sched_domain_span(cluster_sd)) : 0;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg;
@@ -6305,6 +6306,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool else nr = 4;
/* compensation for CPUs outside the cluster */
nr += cluster_weight/2;
Any particular reason for the division by 2, or is it just another heuristic number?

If the target is the first CPU of the cluster, we actually haven't changed the behavior. If the target is the last CPU of the cluster, the CPUs outside the cluster lose scan opportunities amounting to (nearly) the whole cluster.
So on average, CPUs outside the cluster lose about half a cluster's worth of scan slots. I'm not quite sure this is going to work, though. For example, when nr = 4 the system is quite busy, and we probably don't need to do any compensation.
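To spell out that average with a made-up 4-CPU cluster (editorial arithmetic, not from the mail): if the target sits at offset k within the cluster, starting the wrap at the cluster's first CPU puts the k cluster CPUs ahead of the target in front of everything outside the cluster, so the outside CPUs lose k of the nr scan slots; averaged over k = 0, 1, 2, 3 that is (0 + 1 + 2 + 3) / 4 = 1.5, i.e. roughly cluster_weight / 2, which is where the nr += cluster_weight/2 compensation comes from.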
time = cpu_clock(this); }
Thanks Barry
Thanks Barry
On 2021/11/19 17:27, Song Bao Hua (Barry Song) wrote:
-----Original Message----- From: yangyicong Sent: Friday, November 19, 2021 9:50 PM To: Barry Song 21cnbao@gmail.com Cc: yangyicong yangyicong@huawei.com; Tim Chen tim.c.chen@linux.intel.com; linaro-open-discussions@op-lists.linaro.org; yangyicong yangyicong@huawei.com; Song Bao Hua (Barry Song) song.bao.hua@hisilicon.com; Zengtao (B) prime.zeng@hisilicon.com; Jonathan Cameron jonathan.cameron@huawei.com; shenyang (M) shenyang39@huawei.com; tangchengchang tangchengchang@huawei.com; Linuxarm linuxarm@huawei.com Subject: Re: [PATCH 2/2] sched/fair: Scan from the first cpu of cluster if presents in select_idle_cpu
On 2021/11/19 15:13, Barry Song wrote:
On Fri, Nov 19, 2021 at 2:48 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com wrote:
On 2021/11/12 3:24, Barry Song wrote:
On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com
wrote:
> > On 2021/11/11 5:46, Barry Song wrote: >>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>>>> index ff69f245b939..852a048a5f8c 100644 >>>>>>> --- a/kernel/sched/fair.c >>>>>>> +++ b/kernel/sched/fair.c >>>>>>> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct
task_struct *p, struct sched_domain *sd
>>>>>>> static int select_idle_cpu(struct task_struct *p, struct
sched_domain *sd, bool has_idle_core, int target)
>>>>>>> { >>>>>>> struct cpumask *cpus =
this_cpu_cpumask_var_ptr(select_idle_mask);
>>>>>>> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >>>>>>> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >>>>>>> + struct sched_domain *this_sd, *cluster_sd; >>>>>>> struct rq *this_rq = this_rq(); >>>>>>> int this = smp_processor_id(); >>>>>>> - struct sched_domain *this_sd; >>>>>>> u64 time = 0; >>>>>>> >>>>>>> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >>>>>>> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct
task_struct *p, struct sched_domain *sd, bool
>>>>>>> return -1; >>>>>>> >>>>>>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>>>>>> + cpumask_clear_cpu(target, cpus); >> >> May double test if it is necessary to clear the target as target can >> be idle after it was scanned. >> >>>>>>> + >>>>>>> + cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster)); >> >> This line is wrong. should be: >> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >> index 852a048..0a946ba 100644 >> --- a/kernel/sched/fair.c >> +++ b/kernel/sched/fair.c >> @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct >> *p, struct sched_domain *sd, bool >> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >> cpumask_clear_cpu(target, cpus); >> >> - cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster)); >> + cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); >> scan_from = cluster_sd ? >> cpumask_first(sched_domain_span(cluster_sd)) : target + 1; >> >> testing needs to be done again with this fix. >> > > Since we also need to reconsider which LLC domain we should take? > > Currently we take the LLC of this CPU and as tested by @shenyang, there're
cases
> that target and this cpu from different NUMA(=LLC on 920).
we are scanning the cluster or the llc of the target CPU on this cpu. we are not taking the LLC of this CPU at all as sd has been set to LLC by select_idle_sibling before select_idle_cpu is called.
you're right. I misread here.
I've changed the patch as below: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f24..4918756 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
struct sched_domain *this_sd; u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); }
for_each_cpu_wrap(cpu, cpus, target + 1) {
for_each_cpu_wrap(cpu, cpus, scan_from) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits)
tbench4 on 4NUMA machine:
Vallin Scancluster Vallina Scancluster/
Hmean 1 248.79 ( 0.00%) 209.32 * -15.86%* Hmean 2 451.59 ( 0.00%) 458.94 * 1.63%* Hmean 4 1124.71 ( 0.00%) 1136.36 * 1.04%* Hmean 8 2121.84 ( 0.00%) 2253.71 * 6.21%* Hmean 16 4244.98 ( 0.00%) 4561.48 * 7.46%* Hmean 32 7405.50 ( 0.00%) 8322.98 * 12.39%* Hmean 64 4057.07 ( 0.00%) 4434.10 * 9.29%* Hmean 128 5796.01 ( 0.00%) 6005.13 * 3.61%* Hmean 256 7085.31 ( 0.00%) 7904.76 * 11.57%* Hmean 512 7531.25 ( 0.00%) 7363.00 * -2.23%*
Hi Yicong, Will this fix the regression for pgbench, and even further improve tbench and hackbench?
I'll have a try.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4918756..c6ae05f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,7 +6265,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX, cluster_weight; struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
@@ -6279,6 +6279,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
cluster_weight = cluster_sd ?
cpumask_weight(sched_domain_span(cluster_sd)) : 0;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg;
@@ -6305,6 +6306,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool else nr = 4;
/* compensation for CPUs outside the cluster */
nr += cluster_weight/2;
Any particular reason for the division by 2, or is it just another heuristic number?

If the target is the first CPU of the cluster, we actually haven't changed the behavior. If the target is the last CPU of the cluster, the CPUs outside the cluster lose scan opportunities amounting to (nearly) the whole cluster.
So on average, CPUs outside the cluster lose about half a cluster's worth of scan slots. I'm not quite sure this is going to work, though. For example, when nr = 4 the system is quite busy, and we probably don't need to do any compensation.

Perhaps we can do the test with SIS_PROP disabled to see whether it matters.
time = cpu_clock(this); }
Thanks Barry
Thanks Barry
-----Original Message-----
From: yangyicong
Sent: Friday, November 19, 2021 10:40 PM
To: Song Bao Hua (Barry Song) song.bao.hua@hisilicon.com; yangyicong yangyicong@huawei.com; Barry Song 21cnbao@gmail.com
Cc: yangyicong yangyicong@huawei.com; Tim Chen tim.c.chen@linux.intel.com; linaro-open-discussions@op-lists.linaro.org; Zengtao (B) prime.zeng@hisilicon.com; Jonathan Cameron jonathan.cameron@huawei.com; shenyang (M) shenyang39@huawei.com; tangchengchang tangchengchang@huawei.com; Linuxarm linuxarm@huawei.com
Subject: Re: [PATCH 2/2] sched/fair: Scan from the first cpu of cluster if presents in select_idle_cpu
On 2021/11/19 17:27, Song Bao Hua (Barry Song) wrote:
-----Original Message----- From: yangyicong Sent: Friday, November 19, 2021 9:50 PM To: Barry Song 21cnbao@gmail.com Cc: yangyicong yangyicong@huawei.com; Tim Chen
linaro-open-discussions@op-lists.linaro.org; yangyicong yangyicong@huawei.com; Song Bao Hua (Barry Song) song.bao.hua@hisilicon.com; Zengtao (B) prime.zeng@hisilicon.com; Jonathan Cameron jonathan.cameron@huawei.com; shenyang (M) shenyang39@huawei.com; tangchengchang tangchengchang@huawei.com; Linuxarm linuxarm@huawei.com Subject: Re: [PATCH 2/2] sched/fair: Scan from the first cpu of cluster if presents in select_idle_cpu
On 2021/11/19 15:13, Barry Song wrote:
On Fri, Nov 19, 2021 at 2:48 PM Barry Song 21cnbao@gmail.com wrote:
On Fri, Nov 12, 2021 at 2:13 PM Yicong Yang yangyicong@hisilicon.com
wrote:
On 2021/11/12 3:24, Barry Song wrote: > On Fri, Nov 12, 2021 at 1:31 AM Yicong Yang yangyicong@hisilicon.com
wrote:
>> >> On 2021/11/11 5:46, Barry Song wrote: >>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>>>>>>> index ff69f245b939..852a048a5f8c 100644 >>>>>>>> --- a/kernel/sched/fair.c >>>>>>>> +++ b/kernel/sched/fair.c >>>>>>>> @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct
task_struct *p, struct sched_domain *sd
>>>>>>>> static int select_idle_cpu(struct task_struct *p, struct
sched_domain *sd, bool has_idle_core, int target)
>>>>>>>> { >>>>>>>> struct cpumask *cpus =
this_cpu_cpumask_var_ptr(select_idle_mask);
>>>>>>>> - int i, cpu, idle_cpu = -1, nr = INT_MAX; >>>>>>>> + int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX; >>>>>>>> + struct sched_domain *this_sd, *cluster_sd; >>>>>>>> struct rq *this_rq = this_rq(); >>>>>>>> int this = smp_processor_id(); >>>>>>>> - struct sched_domain *this_sd; >>>>>>>> u64 time = 0; >>>>>>>> >>>>>>>> this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); >>>>>>>> @@ -6276,6 +6276,10 @@ static int select_idle_cpu(struct
task_struct *p, struct sched_domain *sd, bool
>>>>>>>> return -1; >>>>>>>> >>>>>>>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>>>>>>> + cpumask_clear_cpu(target, cpus); >>> >>> May double test if it is necessary to clear the target as target can >>> be idle after it was scanned. >>> >>>>>>>> + >>>>>>>> + cluster_sd =
rcu_dereference(*this_cpu_ptr(&sd_cluster));
>>> >>> This line is wrong. should be: >>> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c >>> index 852a048..0a946ba 100644 >>> --- a/kernel/sched/fair.c >>> +++ b/kernel/sched/fair.c >>> @@ -6278,7 +6278,7 @@ static int select_idle_cpu(struct task_struct >>> *p, struct sched_domain *sd, bool >>> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); >>> cpumask_clear_cpu(target, cpus); >>> >>> - cluster_sd = rcu_dereference(*this_cpu_ptr(&sd_cluster)); >>> + cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); >>> scan_from = cluster_sd ? >>> cpumask_first(sched_domain_span(cluster_sd)) : target + 1; >>> >>> testing needs to be done again with this fix. >>> >> >> Since we also need to reconsider which LLC domain we should take? >> >> Currently we take the LLC of this CPU and as tested by @shenyang, there're
cases
>> that target and this cpu from different NUMA(=LLC on 920). > > we are scanning the cluster or the llc of the target CPU on this cpu. > we are not taking the LLC of this CPU at all as sd has been set to > LLC by select_idle_sibling before select_idle_cpu is called. >
you're right. I misread here.
I've changed the patch as below: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f24..4918756 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,10 +6265,10 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus =
this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
struct sched_domain *this_sd; u64 time = 0; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
@@ -6277,6 +6277,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies;
@@ -6305,7 +6308,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool time = cpu_clock(this); }
for_each_cpu_wrap(cpu, cpus, target + 1) {
for_each_cpu_wrap(cpu, cpus, scan_from) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); if ((unsigned int)i < nr_cpumask_bits)
tbench4 on 4NUMA machine:
Vallin Scancluster Vallina Scancluster/
Hmean 1 248.79 ( 0.00%) 209.32 * -15.86%* Hmean 2 451.59 ( 0.00%) 458.94 * 1.63%* Hmean 4 1124.71 ( 0.00%) 1136.36 * 1.04%* Hmean 8 2121.84 ( 0.00%) 2253.71 * 6.21%* Hmean 16 4244.98 ( 0.00%) 4561.48 * 7.46%* Hmean 32 7405.50 ( 0.00%) 8322.98 * 12.39%* Hmean 64 4057.07 ( 0.00%) 4434.10 * 9.29%* Hmean 128 5796.01 ( 0.00%) 6005.13 * 3.61%* Hmean 256 7085.31 ( 0.00%) 7904.76 * 11.57%* Hmean 512 7531.25 ( 0.00%) 7363.00 * -2.23%*
Hi Yicong, Will this fix the regression for pgbench, and even further improve tbench and hackbench?
I'll have a try.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4918756..c6ae05f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6265,7 +6265,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX;
int i, cpu, scan_from, idle_cpu = -1, nr = INT_MAX, cluster_weight; struct sched_domain *this_sd, *cluster_sd; struct rq *this_rq = this_rq(); int this = smp_processor_id();
@@ -6279,6 +6279,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
cluster_sd = rcu_dereference(per_cpu(sd_cluster, target)); scan_from = cluster_sd ?
cpumask_first(sched_domain_span(cluster_sd)) : target + 1;
cluster_weight = cluster_sd ?
cpumask_weight(sched_domain_span(cluster_sd)) : 0;
if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg;
@@ -6305,6 +6306,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool else nr = 4;
/* compensation for CPUs outside the cluster */
nr += cluster_weight/2;
Any particular reason for the division by 2, or is it just another heuristic number?

If the target is the first CPU of the cluster, we actually haven't changed the behavior. If the target is the last CPU of the cluster, the CPUs outside the cluster lose scan opportunities amounting to (nearly) the whole cluster.
So on average, CPUs outside the cluster lose about half a cluster's worth of scan slots. I'm not quite sure this is going to work, though. For example, when nr = 4 the system is quite busy, and we probably don't need to do any compensation.

Perhaps we can do the test with SIS_PROP disabled to see whether it matters.
Yep. Another problem with scanning from the first CPU of the cluster is that neither Linux nor the hardware can guarantee that CPU IDs are always contiguous within a sched_domain. For example:
    cluster CPUs: 4, 8, 12, 16
    LLC CPUs:     4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20...
Then ....
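Filling in what that implies (my reading of the example above, not spelled out in the mail): scan_from would be cpumask_first() of the cluster span, i.e. CPU 4, but for_each_cpu_wrap() walks the combined LLC mask in ID order, so after CPU 4 it visits 5, 6 and 7 -- CPUs outside the cluster -- before ever reaching 8, 12 and 16. Starting the wrap at the cluster's first CPU therefore only keeps the scan inside the cluster when the cluster's CPU IDs happen to be contiguous.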
time = cpu_clock(this); }
Thanks Barry
Thanks Barry
Yep. Another problem with scanning from the first CPU of the cluster is that neither Linux nor the hardware can guarantee that CPU IDs are always contiguous within a sched_domain. For example:
    cluster CPUs: 4, 8, 12, 16
    LLC CPUs:     4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20...
Then ....

Therefore, I am taking the old solution back, but we might need to decrease nr somewhat, especially when the system is busy, because we have already scanned three CPUs before getting to SIS_PROP.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f24..e39de86 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6257,6 +6257,36 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 
 #endif /* CONFIG_SCHED_SMT */
 
+#ifdef CONFIG_SCHED_CLUSTER
+static inline int scan_cluster(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target, int *idle_cpu)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+	int i = -1, cpu;
+
+	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	cpumask_clear_cpu(target, cpus);
+
+	for_each_cpu_wrap(cpu, cpus, target + 1) {
+		if (has_idle_core) {
+			i = select_idle_core(p, cpu, cpus, idle_cpu);
+			if ((unsigned int)i < nr_cpumask_bits)
+				return i;
+		} else {
+			i = __select_idle_cpu(cpu, p);
+			if ((unsigned int)i < nr_cpumask_bits)
+				break;
+		}
+	}
+
+	return i;
+}
+#else
+static inline int scan_cluster(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target, int *idle_cpu)
+{
+	return -1;
+}
+#endif
+
 /*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -6268,15 +6298,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
-	struct sched_domain *this_sd;
+	struct sched_domain *this_sd, *cluster_sd;
 	u64 time = 0;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
 		return -1;
 
-	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	/* scan cluster before scanning LLC */
+	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
+	if (cluster_sd) {
+		i = scan_cluster(p, cluster_sd, has_idle_core, target, &idle_cpu);
+		if ((unsigned int)i < nr_cpumask_bits)
+			return i;
+	}
+
+	/* scan LLC excluding cluster */
+	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	if (cluster_sd)
+		cpumask_andnot(cpus, cpus, sched_domain_span(cluster_sd));
+
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
 		unsigned long now = jiffies;
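To make the difference between the two approaches concrete, here is a small editorial sketch -- plain userspace C, not kernel code; the cluster IDs, the 4-20 LLC span and the choice of target = 16 are assumptions for illustration. It prints the visit order of a wrap over the whole LLC mask starting at the cluster's first CPU, and then the visit order of the cluster-first-then-rest approach above:

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 32

static bool llc[NR_CPUS], cluster[NR_CPUS];

/* crude stand-in for for_each_cpu_wrap(): walk @mask once, starting at @start */
static void walk_wrap(const bool *mask, int start, const char *what)
{
	printf("%s:", what);
	for (int n = 0; n < NR_CPUS; n++) {
		int cpu = (start + n) % NR_CPUS;
		if (mask[cpu])
			printf(" %d", cpu);
	}
	printf("\n");
}

int main(void)
{
	bool llc_minus_cluster[NR_CPUS];
	int target = 16;

	for (int cpu = 4; cpu <= 20; cpu++)
		llc[cpu] = true;
	cluster[4] = cluster[8] = cluster[12] = cluster[16] = true;
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		llc_minus_cluster[cpu] = llc[cpu] && !cluster[cpu];

	/* "scan from first CPU of cluster" over the whole LLC mask */
	walk_wrap(llc, 4, "scan_from = first cpu of cluster");

	/* the approach above: cluster first (minus target), then LLC minus cluster */
	cluster[target] = false;
	walk_wrap(cluster, target + 1, "scan_cluster()");
	walk_wrap(llc_minus_cluster, target + 1, "LLC minus cluster");
	return 0;
}

The first walk wanders off to CPUs 5, 6 and 7 right after CPU 4 because the cluster IDs are sparse, while the second visits 4, 8 and 12 before anything outside the cluster and, thanks to the cpumask_andnot() in the patch, never scans a CPU twice.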