Yep. Another problem with scanning from the first CPU of the cluster is that neither Linux nor the hardware can guarantee CPU IDs are always contiguous within a sched_domain, for example:

cluster CPUs: 4, 8, 12, 16
LLC CPUs:     4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20...

Then ....
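Just to spell the contiguity problem out (a toy sketch only, not part of the patch; cluster_sd and target as used in the diff below):

	/*
	 * toy sketch: with cluster CPUs 4, 8, 12, 16, assuming contiguous
	 * IDs scans the wrong CPUs, while walking the cluster span does not
	 */
	int cpu, first = cpumask_first(sched_domain_span(cluster_sd));

	for (cpu = first; cpu < first + 4; cpu++)
		;	/* visits 4, 5, 6, 7 - LLC siblings, not the cluster */

	for_each_cpu_wrap(cpu, sched_domain_span(cluster_sd), target + 1)
		;	/* visits the real members 8, 12, 16, then wraps to 4 */

which is why the diff below keeps iterating the cluster's cpumask instead of relying on CPU numbering.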
Therefore, I am also taking the old solution back, but we might need to decrease nr somewhat, especially when the system is busy, since we have already scanned three CPUs before going to SIS_PROP (rough sketch after the diff below).
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f24..e39de86 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6257,6 +6257,36 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
 
 #endif /* CONFIG_SCHED_SMT */
 
+#ifdef CONFIG_SCHED_CLUSTER
+static inline int scan_cluster(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target, int *idle_cpu)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+	int i = -1, cpu;
+
+	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	cpumask_clear_cpu(target, cpus);
+
+	for_each_cpu_wrap(cpu, cpus, target + 1) {
+		if (has_idle_core) {
+			i = select_idle_core(p, cpu, cpus, idle_cpu);
+			if ((unsigned int)i < nr_cpumask_bits)
+				return i;
+		} else {
+			i = __select_idle_cpu(cpu, p);
+			if ((unsigned int)i < nr_cpumask_bits)
+				break;
+		}
+	}
+
+	return i;
+}
+#else
+static inline int scan_cluster(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target, int *idle_cpu)
+{
+	return -1;
+}
+#endif
+
 /*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -6268,15 +6298,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
-	struct sched_domain *this_sd;
+	struct sched_domain *this_sd, *cluster_sd;
 	u64 time = 0;
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
 		return -1;
 
-	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	/* scan cluster before scanning LLC */
+	cluster_sd = rcu_dereference(per_cpu(sd_cluster, target));
+	if (cluster_sd) {
+		i = scan_cluster(p, cluster_sd, has_idle_core, target, &idle_cpu);
+		if ((unsigned int)i < nr_cpumask_bits)
+			return i;
+	}
 
+	/* scan LLC excluding cluster */
+	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+	if (cluster_sd)
+		cpumask_andnot(cpus, cpus, sched_domain_span(cluster_sd));
+
 	if (sched_feat(SIS_PROP) && !has_idle_core) {
 		u64 avg_cost, avg_idle, span_avg;
 		unsigned long now = jiffies;
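For the nr decrease, one rough idea (untested, just a sketch on top of this diff) would be to credit the cluster CPUs scan_cluster() has already visited right after nr is computed in the SIS_PROP block:

	/*
	 * untested sketch: scan_cluster() has already visited the cluster
	 * CPUs other than target, so take them out of the SIS_PROP budget
	 * before scanning the rest of the LLC
	 */
	if (cluster_sd)
		nr = max_t(int, nr - (cpumask_weight(sched_domain_span(cluster_sd)) - 1), 1);

Whether the extra cpumask_weight() is worth it, or a fixed constant is good enough, probably needs benchmarking on a busy system.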