Skip to content

Commit 33cf66d

Browse files
author
Peter Zijlstra
committed
sched/fair: Proportional newidle balance
Add a randomized algorithm that runs newidle balancing proportional to its success rate.

This improves schbench significantly:

  6.18-rc4:                 2.22 Mrps/s
  6.18-rc4+revert:          2.04 Mrps/s
  6.18-rc4+revert+random:   2.18 Mrps/s

Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%:

  6.17:                 -6%
  6.17+revert:           0%
  6.17+revert+random:   -1%

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Chris Mason <clm@meta.com>
Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com
Link: https://patch.msgid.link/20251107161739.770122091@infradead.org
1 parent 08d473d commit 33cf66d

6 files changed

Lines changed: 64 additions & 4 deletions

File tree

include/linux/sched/topology.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,9 @@ struct sched_domain {
9292
unsigned int nr_balance_failed; /* initialise to 0 */
9393

9494
/* idle_balance() stats */
95+
unsigned int newidle_call;
96+
unsigned int newidle_success;
97+
unsigned int newidle_ratio;
9598
u64 max_newidle_lb_cost;
9699
unsigned long last_decay_max_lb_cost;
97100

kernel/sched/core.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
121121
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
122122

123123
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
124+
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
124125

125126
#ifdef CONFIG_SCHED_PROXY_EXEC
126127
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -8489,6 +8490,8 @@ void __init sched_init_smp(void)
84898490
{
84908491
sched_init_numa(NUMA_NO_NODE);
84918492

8493+
prandom_init_once(&sched_rnd_state);
8494+
84928495
/*
84938496
* There's no userspace yet to cause hotplug operations; hence all the
84948497
* CPU masks are stable and all blatant races in the below code cannot

kernel/sched/fair.c

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12224,11 +12224,27 @@ void update_max_interval(void)
1222412224
max_load_balance_interval = HZ*num_online_cpus()/10;
1222512225
}
1222612226

12227-
static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
12227+
/*
 * Account one newidle balance attempt on @sd.
 *
 * Bumps the per-domain call counter and adds @success to the per-domain
 * success counter. Once the call counter reaches 1024, the current
 * success count is stored in sd->newidle_ratio and both counters are
 * halved.
 */
static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
12228+
{
12229+
sd->newidle_call++;
12230+
sd->newidle_success += success;
12231+
12232+
if (sd->newidle_call >= 1024) {
12233+
/* Snapshot the success count as the ratio before decaying it. */
sd->newidle_ratio = sd->newidle_success;
12234+
/* Halve both counters so the next snapshot blends old and new samples. */
sd->newidle_call /= 2;
12235+
sd->newidle_success /= 2;
12236+
}
12237+
}
12238+
12239+
static inline bool
12240+
update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
1222812241
{
1222912242
unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
1223012243
unsigned long now = jiffies;
1223112244

12245+
if (cost)
12246+
update_newidle_stats(sd, success);
12247+
1223212248
if (cost > sd->max_newidle_lb_cost) {
1223312249
/*
1223412250
* Track max cost of a domain to make sure to not delay the
@@ -12276,7 +12292,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
1227612292
* Decay the newidle max times here because this is a regular
1227712293
* visit to all the domains.
1227812294
*/
12279-
need_decay = update_newidle_cost(sd, 0);
12295+
need_decay = update_newidle_cost(sd, 0, 0);
1228012296
max_cost += sd->max_newidle_lb_cost;
1228112297

1228212298
/*
@@ -12912,17 +12928,37 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
1291212928
break;
1291312929

1291412930
if (sd->flags & SD_BALANCE_NEWIDLE) {
12931+
unsigned int weight = 1;
12932+
12933+
if (sched_feat(NI_RANDOM)) {
12934+
/*
12935+
* Throw a 1k sided dice; and only run
12936+
* newidle_balance according to the success
12937+
* rate.
12938+
*/
12939+
u32 d1k = sched_rng() % 1024;
12940+
weight = 1 + sd->newidle_ratio;
12941+
if (d1k > weight) {
12942+
update_newidle_stats(sd, 0);
12943+
continue;
12944+
}
12945+
weight = (1024 + weight/2) / weight;
12946+
}
1291512947

1291612948
pulled_task = sched_balance_rq(this_cpu, this_rq,
1291712949
sd, CPU_NEWLY_IDLE,
1291812950
&continue_balancing);
1291912951

1292012952
t1 = sched_clock_cpu(this_cpu);
1292112953
domain_cost = t1 - t0;
12922-
update_newidle_cost(sd, domain_cost);
12923-
1292412954
curr_cost += domain_cost;
1292512955
t0 = t1;
12956+
12957+
/*
12958+
* Track max cost of a domain to make sure to not delay the
12959+
* next wakeup on the CPU.
12960+
*/
12961+
update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
1292612962
}
1292712963

1292812964
/*

kernel/sched/features.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true)
121121
SCHED_FEAT(UTIL_EST, true)
122122

123123
SCHED_FEAT(LATENCY_WARN, false)
124+
125+
/*
126+
* Do newidle balancing proportional to its success rate using randomization.
127+
*/
128+
SCHED_FEAT(NI_RANDOM, true)

kernel/sched/sched.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#ifndef _KERNEL_SCHED_SCHED_H
66
#define _KERNEL_SCHED_SCHED_H
77

8+
#include <linux/prandom.h>
89
#include <linux/sched/affinity.h>
910
#include <linux/sched/autogroup.h>
1011
#include <linux/sched/cpufreq.h>
@@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p)
13481349
}
13491350

13501351
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
1352+
DECLARE_PER_CPU(struct rnd_state, sched_rnd_state);
1353+
1354+
/*
 * Return the next 32-bit value from prandom_u32_state(), using the
 * current CPU's instance of the per-CPU sched_rnd_state.
 */
static inline u32 sched_rng(void)
1355+
{
1356+
return prandom_u32_state(this_cpu_ptr(&sched_rnd_state));
1357+
}
13511358

13521359
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
13531360
#define this_rq() this_cpu_ptr(&runqueues)

kernel/sched/topology.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1669,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl,
16691669

16701670
.last_balance = jiffies,
16711671
.balance_interval = sd_weight,
1672+
1673+
/* 50% success rate */
1674+
.newidle_call = 512,
1675+
.newidle_success = 256,
1676+
.newidle_ratio = 512,
1677+
16721678
.max_newidle_lb_cost = 0,
16731679
.last_decay_max_lb_cost = jiffies,
16741680
.child = child,

0 commit comments

Comments
 (0)