1 From 284c17af888771544b54c759f36f2c4c9b488560 Mon Sep 17 00:00:00 2001
2 From: Steven Rostedt <srostedt@redhat.com>
3 Date: Mon, 16 Jul 2012 08:07:43 +0000
4 Subject: [PATCH 219/366] cpu/rt: Rework cpu down for PREEMPT_RT
6 Bringing a CPU down is a pain with the PREEMPT_RT kernel because
7 tasks can be preempted in many more places than in non-RT. In
8 order to handle per_cpu variables, tasks may be pinned to a CPU
9 for a while, and even sleep. But these tasks need to be off the CPU
10 if that CPU is going down.
12 Several synchronization methods have been tried, but when stressed
13 they failed. This is a new approach.
15 A sync_tsk thread is still created and tasks may still block on a
16 lock when the CPU is going down, but how that works is a bit different.
17 When cpu_down() starts, it will create the sync_tsk and wait on it
18 to be informed that the tasks currently pinned on the CPU are no longer
19 pinned. But new tasks that are about to be pinned will still be allowed
20 to do so at this time.
22 Then the notifiers are called. Several notifiers will bring down tasks
23 that will enter these locations. Some of these tasks will take locks
24 of other tasks that are on the CPU. If we don't let those other tasks
25 continue, but make them block until CPU down is done, the tasks that
26 the notifiers are waiting on will never complete as they are waiting
27 for the locks held by the tasks that are blocked.
29 Thus we still let the task pin the CPU until the notifiers are done.
30 After the notifiers run, we then make new tasks entering the pinned
31 CPU sections grab a mutex and wait. This mutex is now a per CPU mutex
32 in the hotplug_pcp descriptor.
34 To help things along, a new function in the scheduler code is created
35 called migrate_me(). This function will try to migrate the current task
36 off the CPU that is going down if possible. When the sync_tsk is created,
37 all tasks will then try to migrate off the CPU going down. There are
38 several cases where this won't work, but it helps in most cases.
40 After the notifiers are called and if a task can't migrate off but enters
41 the pin CPU sections, it will be forced to wait on the hotplug_pcp mutex
42 until the CPU down is complete. Then the scheduler will force the migration anyway.
45 Also, I found that THREAD_BOUND needs to also be accounted for in the
46 pinned CPU, and the migrate_disable no longer treats them special.
47 This helps fix issues with ksoftirqd and workqueue that unbind on CPU down.
49 Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
50 Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
52 include/linux/sched.h | 7 ++
53 kernel/cpu.c | 240 +++++++++++++++++++++++++++++++++++++++++---------
54 kernel/sched/core.c | 82 ++++++++++++++++-
55 3 files changed, 283 insertions(+), 46 deletions(-)
57 diff --git a/include/linux/sched.h b/include/linux/sched.h
58 index e980cf1..787f50a 100644
59 --- a/include/linux/sched.h
60 +++ b/include/linux/sched.h
61 @@ -2299,6 +2299,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
63 extern int set_cpus_allowed_ptr(struct task_struct *p,
64 const struct cpumask *new_mask);
65 +int migrate_me(void);
66 +void tell_sched_cpu_down_begin(int cpu);
67 +void tell_sched_cpu_down_done(int cpu);
70 static inline void do_set_cpus_allowed(struct task_struct *p,
71 const struct cpumask *new_mask)
72 @@ -2311,6 +2315,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
76 +static inline int migrate_me(void) { return 0; }
77 +static inline void tell_sched_cpu_down_begin(int cpu) { }
78 +static inline void tell_sched_cpu_down_done(int cpu) { }
81 #ifdef CONFIG_NO_HZ_COMMON
82 diff --git a/kernel/cpu.c b/kernel/cpu.c
83 index 5d8be91..d3e2928 100644
86 @@ -62,16 +62,10 @@ static int cpu_hotplug_disabled;
89 struct task_struct *active_writer;
91 /* wait queue to wake up the active_writer */
93 -#ifdef CONFIG_PREEMPT_RT_FULL
94 - /* Makes the lock keep the task's state */
97 /* verifies that no writer will get active while readers are active */
101 * Also blocks the new readers during
102 * an ongoing cpu hotplug operation.
103 @@ -83,27 +77,13 @@ static struct {
106 .active_writer = NULL,
107 - .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
108 -#ifdef CONFIG_PREEMPT_RT_FULL
109 - .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock),
111 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
113 + .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
114 #ifdef CONFIG_DEBUG_LOCK_ALLOC
115 .dep_map = {.name = "cpu_hotplug.lock" },
119 -#ifdef CONFIG_PREEMPT_RT_FULL
120 -# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock)
121 -# define hotplug_trylock() rt_spin_trylock(&cpu_hotplug.lock)
122 -# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock)
124 -# define hotplug_lock() mutex_lock(&cpu_hotplug.lock)
125 -# define hotplug_trylock() mutex_trylock(&cpu_hotplug.lock)
126 -# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock)
129 /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
130 #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
131 #define cpuhp_lock_acquire_tryread() \
132 @@ -111,12 +91,42 @@ static struct {
133 #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
134 #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
137 + * hotplug_pcp - per cpu hotplug descriptor
138 + * @unplug: set when pin_current_cpu() needs to sync tasks
139 + * @sync_tsk: the task that waits for tasks to finish pinned sections
140 + * @refcount: counter of tasks in pinned sections
141 + * @grab_lock: set when the tasks entering pinned sections should wait
142 + * @synced: notifier for @sync_tsk to tell cpu_down it's finished
143 + * @mutex: the mutex to make tasks wait (used when @grab_lock is true)
144 + * @mutex_init: zero if the mutex hasn't been initialized yet.
146 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
147 + * is used as a flag and still exists after @sync_tsk has exited and
148 + * @sync_tsk set to NULL.
151 struct task_struct *unplug;
152 + struct task_struct *sync_tsk;
155 struct completion synced;
156 +#ifdef CONFIG_PREEMPT_RT_FULL
159 + struct mutex mutex;
164 +#ifdef CONFIG_PREEMPT_RT_FULL
165 +# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock)
166 +# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock)
168 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
169 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
172 static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
175 @@ -130,18 +140,39 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
176 void pin_current_cpu(void)
178 struct hotplug_pcp *hp;
182 hp = this_cpu_ptr(&hotplug_pcp);
184 - if (!hp->unplug || hp->refcount || preempt_count() > 1 ||
185 + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
186 hp->unplug == current) {
193 + if (hp->grab_lock) {
196 + hotplug_unlock(hp);
200 + * Try to push this task off of this CPU.
202 + if (!migrate_me()) {
204 + hp = this_cpu_ptr(&hotplug_pcp);
205 + if (!hp->grab_lock) {
207 +				 * Just let it continue; it's already pinned
208 + * or about to sleep.
219 @@ -162,26 +193,84 @@ void unpin_current_cpu(void)
220 wake_up_process(hp->unplug);
224 - * FIXME: Is this really correct under all circumstances ?
226 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
228 + set_current_state(TASK_UNINTERRUPTIBLE);
229 + while (hp->refcount) {
230 + schedule_preempt_disabled();
231 + set_current_state(TASK_UNINTERRUPTIBLE);
235 static int sync_unplug_thread(void *data)
237 struct hotplug_pcp *hp = data;
240 hp->unplug = current;
241 + wait_for_pinned_cpus(hp);
244 + * This thread will synchronize the cpu_down() with threads
245 + * that have pinned the CPU. When the pinned CPU count reaches
246 + * zero, we inform the cpu_down code to continue to the next step.
248 set_current_state(TASK_UNINTERRUPTIBLE);
249 - while (hp->refcount) {
250 - schedule_preempt_disabled();
252 + complete(&hp->synced);
255 + * If all succeeds, the next step will need tasks to wait till
256 + * the CPU is offline before continuing. To do this, the grab_lock
257 + * is set and tasks going into pin_current_cpu() will block on the
258 + * mutex. But we still need to wait for those that are already in
259 + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
260 + * will kick this thread out.
262 + while (!hp->grab_lock && !kthread_should_stop()) {
264 + set_current_state(TASK_UNINTERRUPTIBLE);
267 + /* Make sure grab_lock is seen before we see a stale completion */
271 + * Now just before cpu_down() enters stop machine, we need to make
272 + * sure all tasks that are in pinned CPU sections are out, and new
273 + * tasks will now grab the lock, keeping them from entering pinned
276 + if (!kthread_should_stop()) {
278 + wait_for_pinned_cpus(hp);
280 + complete(&hp->synced);
283 + set_current_state(TASK_UNINTERRUPTIBLE);
284 + while (!kthread_should_stop()) {
286 set_current_state(TASK_UNINTERRUPTIBLE);
288 set_current_state(TASK_RUNNING);
290 - complete(&hp->synced);
293 + * Force this thread off this CPU as it's going down and
294 + * we don't want any more work on this CPU.
296 + current->flags &= ~PF_NO_SETAFFINITY;
297 + do_set_cpus_allowed(current, cpu_present_mask);
302 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
304 + wake_up_process(hp->sync_tsk);
305 + wait_for_completion(&hp->synced);
309 * Start the sync_unplug_thread on the target cpu and wait for it to
311 @@ -189,23 +278,83 @@ static int sync_unplug_thread(void *data)
312 static int cpu_unplug_begin(unsigned int cpu)
314 struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
315 - struct task_struct *tsk;
318 + /* Protected by cpu_hotplug.lock */
319 + if (!hp->mutex_init) {
320 +#ifdef CONFIG_PREEMPT_RT_FULL
321 + spin_lock_init(&hp->lock);
323 + mutex_init(&hp->mutex);
325 + hp->mutex_init = 1;
328 + /* Inform the scheduler to migrate tasks off this CPU */
329 + tell_sched_cpu_down_begin(cpu);
331 init_completion(&hp->synced);
332 - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
334 - return (PTR_ERR(tsk));
335 - kthread_bind(tsk, cpu);
336 - wake_up_process(tsk);
337 - wait_for_completion(&hp->synced);
339 + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
340 + if (IS_ERR(hp->sync_tsk)) {
341 + err = PTR_ERR(hp->sync_tsk);
342 + hp->sync_tsk = NULL;
345 + kthread_bind(hp->sync_tsk, cpu);
348 + * Wait for tasks to get out of the pinned sections,
349 + * it's still OK if new tasks enter. Some CPU notifiers will
350 + * wait for tasks that are going to enter these sections and
351 + * we must not have them block.
353 + __cpu_unplug_sync(hp);
358 +static void cpu_unplug_sync(unsigned int cpu)
360 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
362 + init_completion(&hp->synced);
363 +	/* The completion needs to be initialized before setting grab_lock */
366 + /* Grab the mutex before setting grab_lock */
371 + * The CPU notifiers have been completed.
372 + * Wait for tasks to get out of pinned CPU sections and have new
373 + * tasks block until the CPU is completely down.
375 + __cpu_unplug_sync(hp);
377 + /* All done with the sync thread */
378 + kthread_stop(hp->sync_tsk);
379 + hp->sync_tsk = NULL;
382 static void cpu_unplug_done(unsigned int cpu)
384 struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
387 + /* Let all tasks know cpu unplug is finished before cleaning up */
391 + kthread_stop(hp->sync_tsk);
393 + if (hp->grab_lock) {
394 + hotplug_unlock(hp);
395 + /* protected by cpu_hotplug.lock */
398 + tell_sched_cpu_down_done(cpu);
401 void get_online_cpus(void)
402 @@ -214,9 +363,9 @@ void get_online_cpus(void)
403 if (cpu_hotplug.active_writer == current)
405 cpuhp_lock_acquire_read();
407 + mutex_lock(&cpu_hotplug.lock);
408 atomic_inc(&cpu_hotplug.refcount);
410 + mutex_unlock(&cpu_hotplug.lock);
412 EXPORT_SYMBOL_GPL(get_online_cpus);
414 @@ -269,11 +418,11 @@ void cpu_hotplug_begin(void)
415 cpuhp_lock_acquire();
419 + mutex_lock(&cpu_hotplug.lock);
420 prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
421 if (likely(!atomic_read(&cpu_hotplug.refcount)))
424 + mutex_unlock(&cpu_hotplug.lock);
427 finish_wait(&cpu_hotplug.wq, &wait);
428 @@ -282,7 +431,7 @@ void cpu_hotplug_begin(void)
429 void cpu_hotplug_done(void)
431 cpu_hotplug.active_writer = NULL;
433 + mutex_unlock(&cpu_hotplug.lock);
434 cpuhp_lock_release();
437 @@ -519,6 +668,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
439 smpboot_park_threads(cpu);
441 + /* Notifiers are done. Don't let any more tasks pin this CPU. */
442 + cpu_unplug_sync(cpu);
445 * Prevent irq alloc/free while the dying cpu reorganizes the
446 * interrupt affinities.
447 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
448 index 41814f6..0f7a78d 100644
449 --- a/kernel/sched/core.c
450 +++ b/kernel/sched/core.c
451 @@ -1224,6 +1224,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
452 enqueue_task(rq, p, ENQUEUE_RESTORE);
455 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
456 +static DEFINE_MUTEX(sched_down_mutex);
457 +static cpumask_t sched_down_cpumask;
459 +void tell_sched_cpu_down_begin(int cpu)
461 + mutex_lock(&sched_down_mutex);
462 + cpumask_set_cpu(cpu, &sched_down_cpumask);
463 + mutex_unlock(&sched_down_mutex);
466 +void tell_sched_cpu_down_done(int cpu)
468 + mutex_lock(&sched_down_mutex);
469 + cpumask_clear_cpu(cpu, &sched_down_cpumask);
470 + mutex_unlock(&sched_down_mutex);
474 + * migrate_me - try to move the current task off this cpu
476 + * Used by the pin_current_cpu() code to try to get tasks
477 + * to move off the current CPU as it is going down.
478 + * It will only move the task if the task isn't pinned to
479 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
480 + * and the task has to be in a RUNNING state. Otherwise the
481 + * movement of the task will wake it up (change its state
482 + * to running) when the task did not expect it.
484 + * Returns 1 if it succeeded in moving the current task
487 +int migrate_me(void)
489 + struct task_struct *p = current;
490 + struct migration_arg arg;
491 + struct cpumask *cpumask;
492 + struct cpumask *mask;
493 + unsigned long flags;
494 + unsigned int dest_cpu;
498 + * We can not migrate tasks bounded to a CPU or tasks not
499 + * running. The movement of the task will wake it up.
501 + if (p->flags & PF_NO_SETAFFINITY || p->state)
504 + mutex_lock(&sched_down_mutex);
505 + rq = task_rq_lock(p, &flags);
507 + cpumask = this_cpu_ptr(&sched_cpumasks);
508 + mask = &p->cpus_allowed;
510 + cpumask_andnot(cpumask, mask, &sched_down_cpumask);
512 + if (!cpumask_weight(cpumask)) {
513 + /* It's only on this CPU? */
514 + task_rq_unlock(rq, p, &flags);
515 + mutex_unlock(&sched_down_mutex);
519 + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
522 + arg.dest_cpu = dest_cpu;
524 + task_rq_unlock(rq, p, &flags);
526 + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
527 + tlb_migrate_finish(p->mm);
528 + mutex_unlock(&sched_down_mutex);
534 * Change a given task's CPU affinity. Migrate the thread to a
535 * proper CPU and schedule it away if the CPU it's executing on
536 @@ -3122,7 +3200,7 @@ void migrate_disable(void)
538 struct task_struct *p = current;
540 - if (in_atomic() || p->flags & PF_NO_SETAFFINITY) {
542 #ifdef CONFIG_SCHED_DEBUG
543 p->migrate_disable_atomic++;
545 @@ -3155,7 +3233,7 @@ void migrate_enable(void)
549 - if (in_atomic() || p->flags & PF_NO_SETAFFINITY) {
551 #ifdef CONFIG_SCHED_DEBUG
552 p->migrate_disable_atomic--;