rtime.felk.cvut.cz Git - zynq/linux.git/blobdiff - kernel/sched/core.c
Apply preempt_rt patch-4.9-rt1.patch.xz
index 154fd689fe02e910be7abe1102dbac32b5b50a8a..a6aa5801b21ef7a5e28ceb2f6039746d1086bb17 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -129,7 +129,11 @@ const_debug unsigned int sysctl_sched_features =
  * Number of tasks to iterate in a single balance run.
  * Limited because this is done with IRQs disabled.
  */
+#ifndef CONFIG_PREEMPT_RT_FULL
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
+#else
+const_debug unsigned int sysctl_sched_nr_migrate = 8;
+#endif
 
 /*
  * period over which we average the RT time consumption, measured
@@ -345,6 +349,7 @@ static void init_rq_hrtick(struct rq *rq)
 
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
+       rq->hrtick_timer.irqsafe = 1;
 }
 #else  /* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
@@ -449,7 +454,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
        head->lastp = &node->next;
 }
 
-void wake_up_q(struct wake_q_head *head)
+void __wake_up_q(struct wake_q_head *head, bool sleeper)
 {
        struct wake_q_node *node = head->first;
 
@@ -466,7 +471,10 @@ void wake_up_q(struct wake_q_head *head)
                 * wake_up_process() implies a wmb() to pair with the queueing
                 * in wake_q_add() so as not to miss wakeups.
                 */
-               wake_up_process(task);
+               if (sleeper)
+                       wake_up_lock_sleeper(task);
+               else
+                       wake_up_process(task);
                put_task_struct(task);
        }
 }
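
The sleeper flag lets this path distinguish tasks blocked on rtmutex-based "sleeping spinlocks" from ordinary sleepers. The public entry points are presumably kept as thin wrappers around __wake_up_q() in a header that is not part of this hunk; a minimal sketch, assuming that split:

	static inline void wake_up_q(struct wake_q_head *head)
	{
		__wake_up_q(head, false);
	}

	static inline void wake_up_q_sleeper(struct wake_q_head *head)
	{
		/* wake tasks parked on rtmutex-based "sleeping spinlocks" */
		__wake_up_q(head, true);
	}
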
@@ -502,6 +510,38 @@ void resched_curr(struct rq *rq)
                trace_sched_wake_idle_without_ipi(cpu);
 }
 
+#ifdef CONFIG_PREEMPT_LAZY
+void resched_curr_lazy(struct rq *rq)
+{
+       struct task_struct *curr = rq->curr;
+       int cpu;
+
+       if (!sched_feat(PREEMPT_LAZY)) {
+               resched_curr(rq);
+               return;
+       }
+
+       lockdep_assert_held(&rq->lock);
+
+       if (test_tsk_need_resched(curr))
+               return;
+
+       if (test_tsk_need_resched_lazy(curr))
+               return;
+
+       set_tsk_need_resched_lazy(curr);
+
+       cpu = cpu_of(rq);
+       if (cpu == smp_processor_id())
+               return;
+
+       /* NEED_RESCHED_LAZY must be visible before we test polling */
+       smp_mb();
+       if (!tsk_is_polling(curr))
+               smp_send_reschedule(cpu);
+}
+#endif
+
 void resched_cpu(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
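
resched_curr_lazy() only sets the lazy flag on the currently running task, so a fair-class task is not kicked off the CPU immediately but at its next natural preemption point; real-time preemption still goes through resched_curr(). A hypothetical fair-class call site (example_wakeup_preempts() is a stand-in for the real wakeup-preemption check, not an existing function) would switch over roughly like this:

	static void example_check_preempt_wakeup(struct rq *rq, struct task_struct *p)
	{
		/* placeholder policy check, not the real fair-class logic */
		if (example_wakeup_preempts(rq->curr, p))
			resched_curr_lazy(rq);	/* was: resched_curr(rq) */
	}
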
@@ -525,11 +565,14 @@ void resched_cpu(int cpu)
  */
 int get_nohz_timer_target(void)
 {
-       int i, cpu = smp_processor_id();
+       int i, cpu;
        struct sched_domain *sd;
 
+       preempt_disable_rt();
+       cpu = smp_processor_id();
+
        if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
-               return cpu;
+               goto preempt_en_rt;
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
@@ -548,6 +591,8 @@ int get_nohz_timer_target(void)
                cpu = housekeeping_any_cpu();
 unlock:
        rcu_read_unlock();
+preempt_en_rt:
+       preempt_enable_rt();
        return cpu;
 }
 /*
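
preempt_disable_rt()/preempt_enable_rt() are not defined in this file. The assumption, in line with the rest of the RT series, is that they compile away on non-RT kernels and map to the real preemption toggles on PREEMPT_RT, roughly:

	#ifdef CONFIG_PREEMPT_RT_BASE
	# define preempt_disable_rt()	preempt_disable()
	# define preempt_enable_rt()	preempt_enable()
	#else
	# define preempt_disable_rt()	barrier()
	# define preempt_enable_rt()	barrier()
	#endif
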
@@ -1100,6 +1145,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 
        lockdep_assert_held(&p->pi_lock);
 
+       if (__migrate_disabled(p)) {
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               return;
+       }
+
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
 
@@ -1122,6 +1172,84 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                set_curr_task(rq, p);
 }
 
+static DEFINE_PER_CPU(struct cpumask, sched_cpumasks);
+static DEFINE_MUTEX(sched_down_mutex);
+static cpumask_t sched_down_cpumask;
+
+void tell_sched_cpu_down_begin(int cpu)
+{
+       mutex_lock(&sched_down_mutex);
+       cpumask_set_cpu(cpu, &sched_down_cpumask);
+       mutex_unlock(&sched_down_mutex);
+}
+
+void tell_sched_cpu_down_done(int cpu)
+{
+       mutex_lock(&sched_down_mutex);
+       cpumask_clear_cpu(cpu, &sched_down_cpumask);
+       mutex_unlock(&sched_down_mutex);
+}
+
+/**
+ * migrate_me - try to move the current task off this cpu
+ *
+ * Used by the pin_current_cpu() code to try to get tasks
+ * to move off the current CPU as it is going down.
+ * It will only move the task if the task isn't pinned to
+ * the CPU (with migrate_disable, affinity or NO_SETAFFINITY)
+ * and the task is in a RUNNING state. Otherwise moving the
+ * task would wake it up (change its state to running) when
+ * the task did not expect it.
+ *
+ * Returns 1 if it succeeded in moving the current task
+ *         0 otherwise.
+ */
+int migrate_me(void)
+{
+       struct task_struct *p = current;
+       struct migration_arg arg;
+       struct cpumask *cpumask;
+       struct cpumask *mask;
+       unsigned int dest_cpu;
+       struct rq_flags rf;
+       struct rq *rq;
+
+       /*
+        * We cannot migrate tasks bound to a CPU or tasks that are
+        * not running. Moving such a task would wake it up.
+        */
+       if (p->flags & PF_NO_SETAFFINITY || p->state)
+               return 0;
+
+       mutex_lock(&sched_down_mutex);
+       rq = task_rq_lock(p, &rf);
+
+       cpumask = this_cpu_ptr(&sched_cpumasks);
+       mask = &p->cpus_allowed;
+
+       cpumask_andnot(cpumask, mask, &sched_down_cpumask);
+
+       if (!cpumask_weight(cpumask)) {
+               /* It's only on this CPU? */
+               task_rq_unlock(rq, p, &rf);
+               mutex_unlock(&sched_down_mutex);
+               return 0;
+       }
+
+       dest_cpu = cpumask_any_and(cpu_active_mask, cpumask);
+
+       arg.task = p;
+       arg.dest_cpu = dest_cpu;
+
+       task_rq_unlock(rq, p, &rf);
+
+       stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+       tlb_migrate_finish(p->mm);
+       mutex_unlock(&sched_down_mutex);
+
+       return 1;
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
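
The callers of migrate_me() and of the tell_sched_cpu_down_*() notifications live in the RT CPU-hotplug code, outside this file: the hotplug control path brackets the offline operation, while tasks about to pin themselves to the dying CPU try to get out of the way. A hypothetical, simplified sketch of both sides:

	/* Hotplug control side (hypothetical): mark the CPU as going down
	 * for the duration of the offline operation. */
	static void example_cpu_down_prepare(unsigned int cpu)
	{
		tell_sched_cpu_down_begin(cpu);
	}

	static void example_cpu_down_finish(unsigned int cpu)
	{
		tell_sched_cpu_down_done(cpu);
	}

	/* Task side (hypothetical): before pinning itself to the current
	 * CPU, a task tries to move away if that CPU is marked as going
	 * down; returns true if it was migrated. */
	static bool example_try_to_leave_dying_cpu(void)
	{
		return migrate_me() == 1;
	}
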
@@ -1179,7 +1307,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        }
 
        /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpumask_test_cpu(task_cpu(p), new_mask))
+       if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
                goto out;
 
        dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
@@ -1366,6 +1494,18 @@ out:
        return ret;
 }
 
+static bool check_task_state(struct task_struct *p, long match_state)
+{
+       bool match = false;
+
+       raw_spin_lock_irq(&p->pi_lock);
+       if (p->state == match_state || p->saved_state == match_state)
+               match = true;
+       raw_spin_unlock_irq(&p->pi_lock);
+
+       return match;
+}
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -1410,7 +1550,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * is actually now running somewhere else!
                 */
                while (task_running(rq, p)) {
-                       if (match_state && unlikely(p->state != match_state))
+                       if (match_state && !check_task_state(p, match_state))
                                return 0;
                        cpu_relax();
                }
@@ -1425,7 +1565,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                running = task_running(rq, p);
                queued = task_on_rq_queued(p);
                ncsw = 0;
-               if (!match_state || p->state == match_state)
+               if (!match_state || p->state == match_state ||
+                   p->saved_state == match_state)
                        ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                task_rq_unlock(rq, p, &rf);
 
@@ -1680,10 +1821,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
 {
        activate_task(rq, p, en_flags);
        p->on_rq = TASK_ON_RQ_QUEUED;
-
-       /* if a worker is waking up, notify workqueue */
-       if (p->flags & PF_WQ_WORKER)
-               wq_worker_waking_up(p, cpu_of(rq));
 }
 
 /*
@@ -2018,8 +2155,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         */
        smp_mb__before_spinlock();
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       if (!(p->state & state))
+       if (!(p->state & state)) {
+               /*
+                * The task might be running due to a spinlock sleeper
+                * wakeup. Check the saved state and set it to running
+                * if the wakeup condition is true.
+                */
+               if (!(wake_flags & WF_LOCK_SLEEPER)) {
+                       if (p->saved_state & state) {
+                               p->saved_state = TASK_RUNNING;
+                               success = 1;
+                       }
+               }
                goto out;
+       }
+
+       /*
+        * If this is a regular wakeup, then we can unconditionally
+        * clear the saved state of a "lock sleeper".
+        */
+       if (!(wake_flags & WF_LOCK_SLEEPER))
+               p->saved_state = TASK_RUNNING;
 
        trace_sched_waking(p);
 
@@ -2101,53 +2257,6 @@ out:
        return success;
 }
 
-/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- * @cookie: context's cookie for pinning
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
-{
-       struct rq *rq = task_rq(p);
-
-       if (WARN_ON_ONCE(rq != this_rq()) ||
-           WARN_ON_ONCE(p == current))
-               return;
-
-       lockdep_assert_held(&rq->lock);
-
-       if (!raw_spin_trylock(&p->pi_lock)) {
-               /*
-                * This is OK, because current is on_cpu, which avoids it being
-                * picked for load-balance and preemption/IRQs are still
-                * disabled avoiding further scheduler activity on it and we've
-                * not yet picked a replacement task.
-                */
-               lockdep_unpin_lock(&rq->lock, cookie);
-               raw_spin_unlock(&rq->lock);
-               raw_spin_lock(&p->pi_lock);
-               raw_spin_lock(&rq->lock);
-               lockdep_repin_lock(&rq->lock, cookie);
-       }
-
-       if (!(p->state & TASK_NORMAL))
-               goto out;
-
-       trace_sched_waking(p);
-
-       if (!task_on_rq_queued(p))
-               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
-
-       ttwu_do_wakeup(rq, p, 0, cookie);
-       ttwu_stat(p, smp_processor_id(), 0);
-out:
-       raw_spin_unlock(&p->pi_lock);
-}
-
 /**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
@@ -2166,6 +2275,18 @@ int wake_up_process(struct task_struct *p)
 }
 EXPORT_SYMBOL(wake_up_process);
 
+/**
+ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock"
+ * @p: The process to be woken up.
+ *
+ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate
+ * the nature of the wakeup.
+ */
+int wake_up_lock_sleeper(struct task_struct *p)
+{
+       return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
+}
+
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
        return try_to_wake_up(p, state, 0);
@@ -2442,6 +2563,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->on_cpu = 0;
 #endif
        init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+       task_thread_info(p)->preempt_lazy_count = 0;
+#endif
 #ifdef CONFIG_SMP
        plist_node_init(&p->pushable_tasks, MAX_PRIO);
        RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -2770,21 +2894,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
        finish_arch_post_lock_switch();
 
        fire_sched_in_preempt_notifiers(current);
+       /*
+        * We use mmdrop_delayed() here so we don't have to do the
+        * full __mmdrop() when we are the last user.
+        */
        if (mm)
-               mmdrop(mm);
+               mmdrop_delayed(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
                if (prev->sched_class->task_dead)
                        prev->sched_class->task_dead(prev);
 
-               /*
-                * Remove function-return probe instances associated with this
-                * task and put them back on the free list.
-                */
-               kprobe_flush_task(prev);
-
-               /* Task is done with its stack. */
-               put_task_stack(prev);
-
                put_task_struct(prev);
        }
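
mmdrop_delayed() comes from the mm/fork side of the series rather than from this file. The assumption is that on PREEMPT_RT it defers the final __mmdrop() to an RCU callback (with a delayed_drop rcu_head and an __mmdrop_delayed() callback added to struct mm_struct and kernel/fork.c by the same series), so the expensive teardown never runs from this context, and that it falls back to plain mmdrop() otherwise, roughly:

	#ifdef CONFIG_PREEMPT_RT_BASE
	static inline void mmdrop_delayed(struct mm_struct *mm)
	{
		if (atomic_dec_and_test(&mm->mm_count))
			call_rcu(&mm->delayed_drop, __mmdrop_delayed);
	}
	#else
	# define mmdrop_delayed(mm)	mmdrop(mm)
	#endif
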
 
@@ -3252,6 +3371,77 @@ static inline void schedule_debug(struct task_struct *prev)
        schedstat_inc(this_rq()->sched_count);
 }
 
+#if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_SMP)
+
+void migrate_disable(void)
+{
+       struct task_struct *p = current;
+
+       if (in_atomic() || irqs_disabled()) {
+#ifdef CONFIG_SCHED_DEBUG
+               p->migrate_disable_atomic++;
+#endif
+               return;
+       }
+
+#ifdef CONFIG_SCHED_DEBUG
+       if (unlikely(p->migrate_disable_atomic)) {
+               tracing_off();
+               WARN_ON_ONCE(1);
+       }
+#endif
+
+       if (p->migrate_disable) {
+               p->migrate_disable++;
+               return;
+       }
+
+       preempt_disable();
+       preempt_lazy_disable();
+       pin_current_cpu();
+       p->migrate_disable = 1;
+       preempt_enable();
+}
+EXPORT_SYMBOL(migrate_disable);
+
+void migrate_enable(void)
+{
+       struct task_struct *p = current;
+
+       if (in_atomic() || irqs_disabled()) {
+#ifdef CONFIG_SCHED_DEBUG
+               p->migrate_disable_atomic--;
+#endif
+               return;
+       }
+
+#ifdef CONFIG_SCHED_DEBUG
+       if (unlikely(p->migrate_disable_atomic)) {
+               tracing_off();
+               WARN_ON_ONCE(1);
+       }
+#endif
+       WARN_ON_ONCE(p->migrate_disable <= 0);
+
+       if (p->migrate_disable > 1) {
+               p->migrate_disable--;
+               return;
+       }
+
+       preempt_disable();
+       /*
+        * Clearing migrate_disable causes tsk_cpus_allowed to
+        * show the task's original cpu affinity.
+        */
+       p->migrate_disable = 0;
+
+       unpin_current_cpu();
+       preempt_enable();
+       preempt_lazy_enable();
+}
+EXPORT_SYMBOL(migrate_enable);
+#endif
+
 /*
  * Pick up the highest-prio task:
  */
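
A minimal usage sketch (hypothetical caller and per-CPU variable): unlike a preempt_disable() section, a migrate_disable() section remains preemptible and may take rtmutex-based "sleeping spinlocks", while still being guaranteed to stay on one CPU:

	static DEFINE_PER_CPU(unsigned long, example_counter);

	static void example_touch_percpu_state(void)
	{
		migrate_disable();
		/* still preemptible, but guaranteed not to change CPUs */
		this_cpu_inc(example_counter);
		migrate_enable();
	}
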
@@ -3368,19 +3558,6 @@ static void __sched notrace __schedule(bool preempt)
                } else {
                        deactivate_task(rq, prev, DEQUEUE_SLEEP);
                        prev->on_rq = 0;
-
-                       /*
-                        * If a worker went to sleep, notify and ask workqueue
-                        * whether it wants to wake up a task to maintain
-                        * concurrency.
-                        */
-                       if (prev->flags & PF_WQ_WORKER) {
-                               struct task_struct *to_wakeup;
-
-                               to_wakeup = wq_worker_sleeping(prev);
-                               if (to_wakeup)
-                                       try_to_wake_up_local(to_wakeup, cookie);
-                       }
                }
                switch_count = &prev->nvcsw;
        }
@@ -3390,6 +3567,7 @@ static void __sched notrace __schedule(bool preempt)
 
        next = pick_next_task(rq, prev, cookie);
        clear_tsk_need_resched(prev);
+       clear_tsk_need_resched_lazy(prev);
        clear_preempt_need_resched();
        rq->clock_skip_update = 0;
 
@@ -3437,8 +3615,19 @@ void __noreturn do_task_dead(void)
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-       if (!tsk->state || tsk_is_pi_blocked(tsk))
+       if (!tsk->state)
                return;
+       /*
+        * If a worker went to sleep, notify and ask workqueue whether
+        * it wants to wake up a task to maintain concurrency.
+        */
+       if (tsk->flags & PF_WQ_WORKER)
+               wq_worker_sleeping(tsk);
+
+
+       if (tsk_is_pi_blocked(tsk))
+               return;
+
        /*
         * If we are going to sleep and we have plugged IO queued,
         * make sure to submit it to avoid deadlocks.
@@ -3447,6 +3636,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
                blk_schedule_flush_plug(tsk);
 }
 
+static void sched_update_worker(struct task_struct *tsk)
+{
+       if (tsk->flags & PF_WQ_WORKER)
+               wq_worker_running(tsk);
+}
+
 asmlinkage __visible void __sched schedule(void)
 {
        struct task_struct *tsk = current;
@@ -3457,6 +3652,7 @@ asmlinkage __visible void __sched schedule(void)
                __schedule(false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
+       sched_update_worker(tsk);
 }
 EXPORT_SYMBOL(schedule);
 
@@ -3520,6 +3716,30 @@ static void __sched notrace preempt_schedule_common(void)
        } while (need_resched());
 }
 
+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is set, we allow being scheduled away, since that
+ * flag is set by an RT task. Otherwise we try to avoid being scheduled out
+ * as long as the preempt_lazy_count counter is > 0.
+ */
+static __always_inline int preemptible_lazy(void)
+{
+       if (test_thread_flag(TIF_NEED_RESCHED))
+               return 1;
+       if (current_thread_info()->preempt_lazy_count)
+               return 0;
+       return 1;
+}
+
+#else
+
+static inline int preemptible_lazy(void)
+{
+       return 1;
+}
+
+#endif
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
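
The preempt_lazy_count checked here lives in thread_info (see the CONFIG_HAVE_PREEMPT_LAZY initialisation in sched_fork() and init_idle() above); preempt_lazy_disable()/preempt_lazy_enable(), used by migrate_disable()/migrate_enable(), are assumed to be simple counter operations provided elsewhere in the series, roughly as below (simplified; the real enable path presumably also re-checks the resched condition):

	#ifdef CONFIG_PREEMPT_LAZY
	# define preempt_lazy_count()	(current_thread_info()->preempt_lazy_count)
	# define preempt_lazy_disable()	do { preempt_lazy_count()++; barrier(); } while (0)
	# define preempt_lazy_enable()	do { barrier(); preempt_lazy_count()--; } while (0)
	#else
	# define preempt_lazy_disable()	do { } while (0)
	# define preempt_lazy_enable()	do { } while (0)
	#endif
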
@@ -3534,7 +3754,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
         */
        if (likely(!preemptible()))
                return;
-
+       if (!preemptible_lazy())
+               return;
        preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
@@ -3561,6 +3782,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
        if (likely(!preemptible()))
                return;
 
+       if (!preemptible_lazy())
+               return;
+
        do {
                /*
                 * Because the function tracer can trace preempt_count_sub()
@@ -3583,7 +3807,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 * an infinite recursion.
                 */
                prev_ctx = exception_enter();
+               /*
+                * The add/subtract must not be traced by the function
+                * tracer. But we still want to account for the
+                * preempt off latency tracer. Since the _notrace versions
+                * of add/subtract skip the accounting for latency tracer
+                * we must force it manually.
+                */
+               start_critical_timings();
                __schedule(true);
+               stop_critical_timings();
                exception_exit(prev_ctx);
 
                preempt_latency_stop(1);
@@ -4939,6 +5172,7 @@ int __cond_resched_lock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(__cond_resched_lock);
 
+#ifndef CONFIG_PREEMPT_RT_FULL
 int __sched __cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
@@ -4952,6 +5186,7 @@ int __sched __cond_resched_softirq(void)
        return 0;
 }
 EXPORT_SYMBOL(__cond_resched_softirq);
+#endif
 
 /**
  * yield - yield the current processor to other threads.
@@ -5315,7 +5550,9 @@ void init_idle(struct task_struct *idle, int cpu)
 
        /* Set the preempt count _outside_ the spinlocks! */
        init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+       task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
        /*
         * The idle tasks have their own, simple scheduling class:
         */
@@ -5458,6 +5695,8 @@ void sched_setnuma(struct task_struct *p, int nid)
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_HOTPLUG_CPU
+static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm);
+
 /*
  * Ensures that the idle task is using init_mm right before its cpu goes
  * offline.
@@ -5472,7 +5711,12 @@ void idle_task_exit(void)
                switch_mm_irqs_off(mm, &init_mm, current);
                finish_arch_post_lock_switch();
        }
-       mmdrop(mm);
+       /*
+        * Defer the cleanup to an alive cpu. On RT we can neither
+        * call mmdrop() nor mmdrop_delayed() from here.
+        */
+       per_cpu(idle_last_mm, smp_processor_id()) = mm;
+
 }
 
 /*
@@ -7418,6 +7662,10 @@ int sched_cpu_dying(unsigned int cpu)
        update_max_interval();
        nohz_balance_exit_idle(cpu);
        hrtick_clear(rq);
+       if (per_cpu(idle_last_mm, cpu)) {
+               mmdrop_delayed(per_cpu(idle_last_mm, cpu));
+               per_cpu(idle_last_mm, cpu) = NULL;
+       }
        return 0;
 }
 #endif
@@ -7698,7 +7946,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-       int nested = preempt_count() + rcu_preempt_depth();
+       int nested = preempt_count() + sched_rcu_preempt_depth();
 
        return (nested == preempt_offset);
 }