1 From 284c17af888771544b54c759f36f2c4c9b488560 Mon Sep 17 00:00:00 2001
2 From: Steven Rostedt <srostedt@redhat.com>
3 Date: Mon, 16 Jul 2012 08:07:43 +0000
4 Subject: [PATCH 219/366] cpu/rt: Rework cpu down for PREEMPT_RT
6 Bringing a CPU down is a pain with the PREEMPT_RT kernel because
7 tasks can be preempted in many more places than in non-RT. In
8 order to handle per_cpu variables, tasks may be pinned to a CPU
9 for a while, and even sleep. But these tasks need to be off the CPU
10 if that CPU is going down.
12 Several synchronization methods have been tried, but when stressed
13 they failed. This is a new approach.
15 A sync_tsk thread is still created and tasks may still block on a
16 lock when the CPU is going down, but how that works is a bit different.
17 When cpu_down() starts, it will create the sync_tsk and wait on it
18 to be informed that the tasks currently pinned on the CPU are no longer
19 pinned. But new tasks that are about to be pinned will still be allowed
20 to do so at this time.
22 Then the notifiers are called. Several notifiers will bring down tasks
23 that will enter these locations. Some of these tasks will take locks
24 of other tasks that are on the CPU. If we don't let those other tasks
25 continue, but make them block until CPU down is done, the tasks that
26 the notifiers are waiting on will never complete as they are waiting
27 for the locks held by the tasks that are blocked.
29 Thus we still let the task pin the CPU until the notifiers are done.
30 After the notifiers run, we then make new tasks entering the pinned
31 CPU sections grab a mutex and wait. This mutex is now a per CPU mutex
32 in the hotplug_pcp descriptor.
34 To help things along, a new function in the scheduler code is created
35 called migrate_me(). This function will try to migrate the current task
36 off the CPU that is going down if possible. When the sync_tsk is created,
37 all tasks will then try to migrate off the CPU going down. There are
38 several cases where this won't work, but it helps in most cases.
40 After the notifiers are called and if a task can't migrate off but enters
41 the pin CPU sections, it will be forced to wait on the hotplug_pcp mutex
42 until the CPU down is complete. Then the scheduler will force the migration anyway.
45 Also, I found that THREAD_BOUND needs to also be accounted for in the
46 pinned CPU, and the migrate_disable no longer treats them special.
47 This helps fix issues with ksoftirqd and workqueue that unbind on CPU down.
49 Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
50 Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
52 include/linux/sched.h | 7 ++
53 kernel/cpu.c | 240 +++++++++++++++++++++++++++++++++++++++++---------
54 kernel/sched/core.c | 82 ++++++++++++++++-
55 3 files changed, 283 insertions(+), 46 deletions(-)
57 diff --git a/include/linux/sched.h b/include/linux/sched.h
58 index e980cf1..787f50a 100644
59 --- a/include/linux/sched.h
60 +++ b/include/linux/sched.h
61 @@ -2299,6 +2299,10 @@ extern void do_set_cpus_allowed(struct task_struct *p,
63 extern int set_cpus_allowed_ptr(struct task_struct *p,
64 const struct cpumask *new_mask);
65 +int migrate_me(void);
66 +void tell_sched_cpu_down_begin(int cpu);
67 +void tell_sched_cpu_down_done(int cpu);
70 static inline void do_set_cpus_allowed(struct task_struct *p,
71 const struct cpumask *new_mask)
72 @@ -2311,6 +2315,9 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
76 +static inline int migrate_me(void) { return 0; }
77 +static inline void tell_sched_cpu_down_begin(int cpu) { }
78 +static inline void tell_sched_cpu_down_done(int cpu) { }
81 #ifdef CONFIG_NO_HZ_COMMON
82 diff --git a/kernel/cpu.c b/kernel/cpu.c
83 index 5d8be91..d3e2928 100644
86 @@ -62,16 +62,10 @@ static int cpu_hotplug_disabled;
89 struct task_struct *active_writer;
91 /* wait queue to wake up the active_writer */
93 -#ifdef CONFIG_PREEMPT_RT_FULL
94 - /* Makes the lock keep the task's state */
97 /* verifies that no writer will get active while readers are active */
101 * Also blocks the new readers during
102 * an ongoing cpu hotplug operation.
103 @@ -83,27 +77,13 @@ static struct {
106 .active_writer = NULL,
107 - .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
108 -#ifdef CONFIG_PREEMPT_RT_FULL
109 - .lock = __SPIN_LOCK_UNLOCKED(cpu_hotplug.lock),
111 .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
113 + .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
114 #ifdef CONFIG_DEBUG_LOCK_ALLOC
115 .dep_map = {.name = "cpu_hotplug.lock" },
119 -#ifdef CONFIG_PREEMPT_RT_FULL
120 -# define hotplug_lock() rt_spin_lock(&cpu_hotplug.lock)
121 -# define hotplug_trylock() rt_spin_trylock(&cpu_hotplug.lock)
122 -# define hotplug_unlock() rt_spin_unlock(&cpu_hotplug.lock)
124 -# define hotplug_lock() mutex_lock(&cpu_hotplug.lock)
125 -# define hotplug_trylock() mutex_trylock(&cpu_hotplug.lock)
126 -# define hotplug_unlock() mutex_unlock(&cpu_hotplug.lock)
129 /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
130 #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
131 #define cpuhp_lock_acquire_tryread() \
132 @@ -111,12 +91,42 @@ static struct {
133 #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
134 #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
137 + * hotplug_pcp - per cpu hotplug descriptor
138 + * @unplug: set when pin_current_cpu() needs to sync tasks
139 + * @sync_tsk: the task that waits for tasks to finish pinned sections
140 + * @refcount: counter of tasks in pinned sections
141 + * @grab_lock: set when the tasks entering pinned sections should wait
142 + * @synced: notifier for @sync_tsk to tell cpu_down it's finished
143 + * @mutex: the mutex to make tasks wait (used when @grab_lock is true)
144 + * @mutex_init: zero if the mutex hasn't been initialized yet.
146 + * Although @unplug and @sync_tsk may point to the same task, the @unplug
147 + * is used as a flag and still exists after @sync_tsk has exited and
148 + * @sync_tsk set to NULL.
151 struct task_struct *unplug;
152 + struct task_struct *sync_tsk;
155 struct completion synced;
156 +#ifdef CONFIG_PREEMPT_RT_FULL
159 + struct mutex mutex;
164 +#ifdef CONFIG_PREEMPT_RT_FULL
165 +# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock)
166 +# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock)
168 +# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
169 +# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
172 static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
175 @@ -130,18 +140,39 @@ static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
176 void pin_current_cpu(void)
178 struct hotplug_pcp *hp;
182 hp = this_cpu_ptr(&hotplug_pcp);
184 - if (!hp->unplug || hp->refcount || preempt_count() > 1 ||
185 + if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
186 hp->unplug == current) {
193 + if (hp->grab_lock) {
196 + hotplug_unlock(hp);
200 + * Try to push this task off of this CPU.
202 + if (!migrate_me()) {
204 + hp = this_cpu_ptr(&hotplug_pcp);
205 + if (!hp->grab_lock) {
207 +				 * Just let it continue; it's already pinned
208 + * or about to sleep.
219 @@ -162,26 +193,84 @@ void unpin_current_cpu(void)
220 wake_up_process(hp->unplug);
224 - * FIXME: Is this really correct under all circumstances ?
226 +static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
228 + set_current_state(TASK_UNINTERRUPTIBLE);
229 + while (hp->refcount) {
230 + schedule_preempt_disabled();
231 + set_current_state(TASK_UNINTERRUPTIBLE);
235 static int sync_unplug_thread(void *data)
237 struct hotplug_pcp *hp = data;
240 hp->unplug = current;
241 + wait_for_pinned_cpus(hp);
244 + * This thread will synchronize the cpu_down() with threads
245 + * that have pinned the CPU. When the pinned CPU count reaches
246 + * zero, we inform the cpu_down code to continue to the next step.
248 set_current_state(TASK_UNINTERRUPTIBLE);
249 - while (hp->refcount) {
250 - schedule_preempt_disabled();
252 + complete(&hp->synced);
255 + * If all succeeds, the next step will need tasks to wait till
256 + * the CPU is offline before continuing. To do this, the grab_lock
257 + * is set and tasks going into pin_current_cpu() will block on the
258 + * mutex. But we still need to wait for those that are already in
259 + * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
260 + * will kick this thread out.
262 + while (!hp->grab_lock && !kthread_should_stop()) {
264 + set_current_state(TASK_UNINTERRUPTIBLE);
267 + /* Make sure grab_lock is seen before we see a stale completion */
271 + * Now just before cpu_down() enters stop machine, we need to make
272 + * sure all tasks that are in pinned CPU sections are out, and new
273 + * tasks will now grab the lock, keeping them from entering pinned
276 + if (!kthread_should_stop()) {
278 + wait_for_pinned_cpus(hp);
280 + complete(&hp->synced);
283 + set_current_state(TASK_UNINTERRUPTIBLE);
284 + while (!kthread_should_stop()) {
286 set_current_state(TASK_UNINTERRUPTIBLE);
288 set_current_state(TASK_RUNNING);
290 - complete(&hp->synced);
293 + * Force this thread off this CPU as it's going down and
294 + * we don't want any more work on this CPU.
296 + current->flags &= ~PF_NO_SETAFFINITY;
297 + do_set_cpus_allowed(current, cpu_present_mask);
302 +static void __cpu_unplug_sync(struct hotplug_pcp *hp)
304 + wake_up_process(hp->sync_tsk);
305 + wait_for_completion(&hp->synced);
309 * Start the sync_unplug_thread on the target cpu and wait for it to
311 @@ -189,23 +278,83 @@ static int sync_unplug_thread(void *data)
312 static int cpu_unplug_begin(unsigned int cpu)
314 struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
315 - struct task_struct *tsk;
318 + /* Protected by cpu_hotplug.lock */
319 + if (!hp->mutex_init) {
320 +#ifdef CONFIG_PREEMPT_RT_FULL
321 + spin_lock_init(&hp->lock);
323 + mutex_init(&hp->mutex);
325 + hp->mutex_init = 1;
328 + /* Inform the scheduler to migrate tasks off this CPU */
329 + tell_sched_cpu_down_begin(cpu);
331 init_completion(&hp->synced);
332 - tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
334 - return (PTR_ERR(tsk));
335 - kthread_bind(tsk, cpu);
336 - wake_up_process(tsk);
337 - wait_for_completion(&hp->synced);
339 + hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
340 + if (IS_ERR(hp->sync_tsk)) {
341 + err = PTR_ERR(hp->sync_tsk);
342 + hp->sync_tsk = NULL;
345 + kthread_bind(hp->sync_tsk, cpu);
348 + * Wait for tasks to get out of the pinned sections,
349 + * it's still OK if new tasks enter. Some CPU notifiers will
350 + * wait for tasks that are going to enter these sections and
351 + * we must not have them block.
353 + __cpu_unplug_sync(hp);
358 +static void cpu_unplug_sync(unsigned int cpu)
360 + struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
362 + init_completion(&hp->synced);
363 +	/* The completion needs to be initialized before setting grab_lock */
366 + /* Grab the mutex before setting grab_lock */
371 + * The CPU notifiers have been completed.
372 + * Wait for tasks to get out of pinned CPU sections and have new
373 + * tasks block until the CPU is completely down.
375 + __cpu_unplug_sync(hp);
377 + /* All done with the sync thread */
378 + kthread_stop(hp->sync_tsk);
379 + hp->sync_tsk = NULL;
382 static void cpu_unplug_done(unsigned int cpu)
384 struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
387 + /* Let all tasks know cpu unplug is finished before cleaning up */
391 + kthread_stop(hp->sync_tsk);
393 + if (hp->grab_lock) {
394 + hotplug_unlock(hp);
395 + /* protected by cpu_hotplug.lock */
398 + tell_sched_cpu_down_done(cpu);
401 void get_online_cpus(void)
402 @@ -214,9 +363,9 @@ void get_online_cpus(void)
403 if (cpu_hotplug.active_writer == current)
405 cpuhp_lock_acquire_read();
407 + mutex_lock(&cpu_hotplug.lock);
408 atomic_inc(&cpu_hotplug.refcount);
410 + mutex_unlock(&cpu_hotplug.lock);
412 EXPORT_SYMBOL_GPL(get_online_cpus);
414 @@ -269,11 +418,11 @@ void cpu_hotplug_begin(void)
415 cpuhp_lock_acquire();
419 + mutex_lock(&cpu_hotplug.lock);
420 prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
421 if (likely(!atomic_read(&cpu_hotplug.refcount)))
424 + mutex_unlock(&cpu_hotplug.lock);
427 finish_wait(&cpu_hotplug.wq, &wait);
428 @@ -282,7 +431,7 @@ void cpu_hotplug_begin(void)
429 void cpu_hotplug_done(void)
431 cpu_hotplug.active_writer = NULL;
433 + mutex_unlock(&cpu_hotplug.lock);
434 cpuhp_lock_release();
437 @@ -519,6 +668,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
439 smpboot_park_threads(cpu);
441 + /* Notifiers are done. Don't let any more tasks pin this CPU. */
442 + cpu_unplug_sync(cpu);
445 * Prevent irq alloc/free while the dying cpu reorganizes the
446 * interrupt affinities.
447 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
448 index 41814f6..0f7a78d 100644
449 --- a/kernel/sched/core.c
450 +++ b/kernel/sched/core.c
451 @@ -1224,6 +1224,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
452 enqueue_task(rq, p, ENQUEUE_RESTORE);
455 +static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
456 +static DEFINE_MUTEX(sched_down_mutex);
457 +static cpumask_t sched_down_cpumask;
459 +void tell_sched_cpu_down_begin(int cpu)
461 + mutex_lock(&sched_down_mutex);
462 + cpumask_set_cpu(cpu, &sched_down_cpumask);
463 + mutex_unlock(&sched_down_mutex);
466 +void tell_sched_cpu_down_done(int cpu)
468 + mutex_lock(&sched_down_mutex);
469 + cpumask_clear_cpu(cpu, &sched_down_cpumask);
470 + mutex_unlock(&sched_down_mutex);
474 + * migrate_me - try to move the current task off this cpu
476 + * Used by the pin_current_cpu() code to try to get tasks
477 + * to move off the current CPU as it is going down.
478 + * It will only move the task if the task isn't pinned to
479 + * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
480 + * and the task has to be in a RUNNING state. Otherwise the
481 + * movement of the task will wake it up (change its state
482 + * to running) when the task did not expect it.
484 + * Returns 1 if it succeeded in moving the current task
487 +int migrate_me(void)
489 + struct task_struct *p = current;
490 + struct migration_arg arg;
491 + struct cpumask *cpumask;
492 + struct cpumask *mask;
493 + unsigned long flags;
494 + unsigned int dest_cpu;
498 + * We can not migrate tasks bounded to a CPU or tasks not
499 + * running. The movement of the task will wake it up.
501 + if (p->flags & PF_NO_SETAFFINITY || p->state)
504 + mutex_lock(&sched_down_mutex);
505 + rq = task_rq_lock(p, &flags);
507 + cpumask = this_cpu_ptr(&sched_cpumasks);
508 + mask = &p->cpus_allowed;
510 + cpumask_andnot(cpumask, mask, &sched_down_cpumask);
512 + if (!cpumask_weight(cpumask)) {
513 + /* It's only on this CPU? */
514 + task_rq_unlock(rq, p, &flags);
515 + mutex_unlock(&sched_down_mutex);
519 + dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
522 + arg.dest_cpu = dest_cpu;
524 + task_rq_unlock(rq, p, &flags);
526 + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
527 + tlb_migrate_finish(p->mm);
528 + mutex_unlock(&sched_down_mutex);
534 * Change a given task's CPU affinity. Migrate the thread to a
535 * proper CPU and schedule it away if the CPU it's executing on
536 @@ -3122,7 +3200,7 @@ void migrate_disable(void)
538 struct task_struct *p = current;
540 - if (in_atomic() || p->flags & PF_NO_SETAFFINITY) {
542 #ifdef CONFIG_SCHED_DEBUG
543 p->migrate_disable_atomic++;
545 @@ -3155,7 +3233,7 @@ void migrate_enable(void)
549 - if (in_atomic() || p->flags & PF_NO_SETAFFINITY) {
551 #ifdef CONFIG_SCHED_DEBUG
552 p->migrate_disable_atomic--;