rtime.felk.cvut.cz Git - sojka/nv-tegra/linux-3.10.git/commitdiff
gpu: nvgpu: implement per-channel watchdog
authorDeepak Nibade <dnibade@nvidia.com>
Mon, 31 Aug 2015 09:00:35 +0000 (14:30 +0530)
committermobile promotions <svcmobile_promotions@nvidia.com>
Thu, 5 Nov 2015 07:19:33 +0000 (23:19 -0800)
Implement a per-channel watchdog/timer according to the following rules:
- start the timer while submitting first job on channel or if
  no timer is already running
- cancel the timer when job completes
- re-start the timer if there is any incomplete job left
  in the channel's queue
- trigger appropriate recovery method as part of timeout
  handling mechanism

Handle the timeout as follows:
- get timed out channel, and job data
- disable activity on all engines
- check if fence is really pending
- get information on failing engine
- if no engine is failing, just abort the channel
- if engine is failing, trigger the recovery

Also, add flag "ch_wdt_enabled" to enable/disable channel
watchdog mechanism. Watchdog can also be disabled using
global flag "timeouts_enabled"

Set the watchdog timeout to 5 s using the macro
NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS

Bug 200133289

Change-Id: I401cf14dd34a210bc429f31bd5216a361edf1237
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Reviewed-on: http://git-master/r/797072
(cherry picked from commit 2d4bcbae629bfdee6b7886c9c2bf2932c3ef8245)
Reviewed-on: http://git-master/r/793638
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/815931
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
drivers/gpu/nvgpu/gk20a/channel_gk20a.c
drivers/gpu/nvgpu/gk20a/channel_gk20a.h
drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
drivers/gpu/nvgpu/gk20a/gk20a.c
drivers/gpu/nvgpu/gk20a/gk20a.h

index 7fe4be1fa9686b81d873c032b1bdf34a74cef2db..d1242f0e2ccc0de31c7eded4cbde4a73041fa860 100644 (file)
@@ -1460,6 +1460,14 @@ bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
                ch->timeout_accumulated_ms > ch->timeout_ms_max;
 }
 
+/*
+ * Return the watchdog delay, in milliseconds, to arm for this channel.
+ *
+ * Returns the 5 s default only when BOTH the global timeouts_enabled flag
+ * and the per-GPU ch_wdt_enabled flag are set; otherwise returns a huge
+ * value so the delayed work effectively never fires (watchdog disabled).
+ *
+ * NOTE(review): MAX_SCHEDULE_TIMEOUT is a jiffies constant (LONG_MAX) but
+ * the caller feeds this value through msecs_to_jiffies(); the cast to u32
+ * still yields an effectively-infinite delay, but the units are mixed —
+ * confirm this is intentional.
+ */
+static u32 gk20a_get_channel_watchdog_timeout(struct channel_gk20a *ch)
+{
+       if (ch->g->timeouts_enabled && ch->g->ch_wdt_enabled)
+               return NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS;
+       else
+               return (u32)MAX_SCHEDULE_TIMEOUT;
+}
+
 static u32 get_gp_free_count(struct channel_gk20a *c)
 {
        update_gp_get(c->g, c);
@@ -1543,6 +1551,112 @@ static void trace_write_pushbuffer_range(struct channel_gk20a *c,
                nvgpu_free(g);
 }
 
+/*
+ * Arm the per-channel watchdog for @job, unless a watchdog is already
+ * running on this channel (in that case this is a no-op, so only the
+ * oldest outstanding job is being watched at any time).
+ *
+ * Callers: job submission (gk20a_channel_add_job), channel update when an
+ * incomplete job remains queued, and the timeout handler itself when the
+ * stall was attributed to a different channel.
+ */
+static void gk20a_channel_timeout_start(struct channel_gk20a *ch,
+               struct channel_gk20a_job *job)
+{
+       mutex_lock(&ch->timeout.lock);
+
+       /* Already armed for an earlier job — leave that timer running. */
+       if (ch->timeout.initialized) {
+               mutex_unlock(&ch->timeout.lock);
+               return;
+       }
+
+       ch->timeout.job = job;
+       ch->timeout.initialized = true;
+       /* Delay is per-channel; effectively infinite when wdt is disabled. */
+       schedule_delayed_work(&ch->timeout.wq,
+              msecs_to_jiffies(gk20a_get_channel_watchdog_timeout(ch)));
+
+       mutex_unlock(&ch->timeout.lock);
+}
+
+/*
+ * Disarm the per-channel watchdog.  Called when the watched job
+ * completes.  A no-op when no watchdog is currently armed.
+ *
+ * cancel_delayed_work_sync() waits for a concurrently executing
+ * gk20a_channel_timeout_handler() to finish, and that handler takes
+ * ch->timeout.lock itself — so calling the sync cancel while holding
+ * the lock can deadlock (handler blocked on the mutex, cancel waiting
+ * for the handler).  Clear the armed state under the lock, then cancel
+ * the work outside of it.
+ */
+static void gk20a_channel_timeout_stop(struct channel_gk20a *ch)
+{
+       mutex_lock(&ch->timeout.lock);
+
+       if (!ch->timeout.initialized) {
+               mutex_unlock(&ch->timeout.lock);
+               return;
+       }
+
+       ch->timeout.initialized = false;
+       mutex_unlock(&ch->timeout.lock);
+
+       /* Must not hold timeout.lock here; the handler acquires it. */
+       cancel_delayed_work_sync(&ch->timeout.wq);
+}
+
+/*
+ * Watchdog expiry handler, run from the delayed-work queue when the
+ * watched job has not completed within the watchdog period.
+ *
+ * Flow: take the global watchdog lock, grab the timed-out job and disarm
+ * the channel's watchdog state, quiesce all engines, re-check the fence
+ * (the job may have completed in the meantime), then either abort the
+ * channel/TSG (no failing engine identified) or trigger engine recovery
+ * on the channel the hardware reports as failing.
+ *
+ * NOTE(review): @job is read without taking a reference on it; presumably
+ * gk20a_channel_update() cannot free it while the fence is still pending,
+ * but there is no visible guarantee here — verify against the job
+ * life-cycle in channel update.
+ */
+static void gk20a_channel_timeout_handler(struct work_struct *work)
+{
+       struct channel_gk20a_job *job;
+       struct gk20a *g;
+       struct channel_gk20a *ch;
+       struct channel_gk20a *failing_ch;
+       u32 engine_id;
+       int id = -1;
+       bool is_tsg = false;
+
+       /* Recover the owning channel from the embedded delayed_work and
+        * take a reference so it cannot be freed under us. */
+       ch = container_of(to_delayed_work(work), struct channel_gk20a,
+                       timeout.wq);
+       ch = gk20a_channel_get(ch);
+       if (!ch)
+               return;
+
+       g = ch->g;
+
+       /* Need global lock since multiple channels can timeout at a time */
+       mutex_lock(&g->ch_wdt_lock);
+
+       /* Get timed out job and reset the timer */
+       mutex_lock(&ch->timeout.lock);
+       job = ch->timeout.job;
+       ch->timeout.initialized = false;
+       mutex_unlock(&ch->timeout.lock);
+
+       /* Quiesce all engines before inspecting state; bail out if that
+        * fails (nothing was disabled, so skip the re-enable path). */
+       if (gk20a_fifo_disable_all_engine_activity(g, true))
+               goto fail_unlock;
+
+       /* False alarm: the job completed between expiry and now. */
+       if (gk20a_fence_is_expired(job->post_fence))
+               goto fail_enable_engine_activity;
+
+       gk20a_err(dev_from_gk20a(g), "Job on channel %d timed out\n",
+               ch->hw_chid);
+
+       /* Get failing engine data */
+       engine_id = gk20a_fifo_get_failing_engine_data(g, &id, &is_tsg);
+
+       if (engine_id >= g->fifo.max_engines) {
+               /* If no failing engine, abort the channels */
+               if (gk20a_is_channel_marked_as_tsg(ch)) {
+                       struct tsg_gk20a *tsg = &g->fifo.tsg[ch->tsgid];
+
+                       gk20a_fifo_set_ctx_mmu_error_tsg(g, tsg);
+                       gk20a_fifo_abort_tsg(g, ch->tsgid);
+               } else {
+                       gk20a_fifo_set_ctx_mmu_error_ch(g, ch);
+                       gk20a_channel_abort(ch);
+               }
+       } else {
+               /* If failing engine, trigger recovery */
+               failing_ch = gk20a_channel_get(&g->fifo.channel[id]);
+               if (!failing_ch)
+                       goto fail_enable_engine_activity;
+
+               /* The stall was caused by a different channel: re-arm our
+                * own watchdog so this job is watched again after the
+                * culprit is recovered. */
+               if (failing_ch->hw_chid != ch->hw_chid)
+                       gk20a_channel_timeout_start(ch, job);
+
+               gk20a_fifo_recover(g, BIT(engine_id),
+                       failing_ch->hw_chid, is_tsg,
+                       failing_ch->timeout_debug_dump);
+
+               gk20a_channel_put(failing_ch);
+       }
+
+fail_enable_engine_activity:
+       gk20a_fifo_enable_all_engine_activity(g);
+fail_unlock:
+       mutex_unlock(&g->ch_wdt_lock);
+       gk20a_channel_put(ch);
+}
+
 static int gk20a_channel_add_job(struct channel_gk20a *c,
                                 struct gk20a_fence *pre_fence,
                                 struct gk20a_fence *post_fence,
@@ -1581,6 +1695,8 @@ static int gk20a_channel_add_job(struct channel_gk20a *c,
                job->pre_fence = gk20a_fence_get(pre_fence);
                job->post_fence = gk20a_fence_get(post_fence);
 
+               gk20a_channel_timeout_start(c, job);
+
                mutex_lock(&c->jobs_lock);
                list_add_tail(&job->list, &c->jobs);
                mutex_unlock(&c->jobs_lock);
@@ -1607,8 +1723,12 @@ void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
                struct gk20a *g = c->g;
 
                bool completed = gk20a_fence_is_expired(job->post_fence);
-               if (!completed)
+               if (!completed) {
+                       gk20a_channel_timeout_start(c, job);
                        break;
+               }
+
+               gk20a_channel_timeout_stop(c);
 
                if (c->sync)
                        c->sync->signal_timeline(c->sync);
@@ -1965,6 +2085,8 @@ int gk20a_init_channel_support(struct gk20a *g, u32 chid)
        mutex_init(&c->ioctl_lock);
        mutex_init(&c->jobs_lock);
        mutex_init(&c->submit_lock);
+       mutex_init(&c->timeout.lock);
+       INIT_DELAYED_WORK(&c->timeout.wq, gk20a_channel_timeout_handler);
        INIT_LIST_HEAD(&c->jobs);
 #if defined(CONFIG_GK20A_CYCLE_STATS)
        mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
index 8901cf49c46dc7349106053e56b32d8957d9fa4d..fb2b1b0bbfba34727cb2e25c41f1a84d13c19b13 100644 (file)
@@ -38,6 +38,8 @@ struct gk20a_fence;
 #include "gr_gk20a.h"
 #include "fence_gk20a.h"
 
+#define NVGPU_CHANNEL_WATCHDOG_DEFAULT_TIMEOUT_MS      5000
+
 struct gpfifo {
        u32 entry0;
        u32 entry1;
@@ -75,6 +77,13 @@ struct channel_gk20a_job {
        struct list_head list;
 };
 
+/*
+ * Per-channel watchdog state, embedded in struct channel_gk20a as
+ * ch->timeout.
+ */
+struct channel_gk20a_timeout {
+       /* Delayed work that runs the channel timeout handler on expiry */
+       struct delayed_work wq;
+       /* Protects the fields below */
+       struct mutex lock;
+       /* True while the watchdog is armed for a job */
+       bool initialized;
+       /* Job being watched; valid only while initialized is true */
+       struct channel_gk20a_job *job;
+};
+
 struct channel_gk20a_poll_events {
        struct mutex lock;
        bool events_enabled;
@@ -132,6 +141,8 @@ struct channel_gk20a {
        u32 timeout_accumulated_ms;
        u32 timeout_gpfifo_get;
 
+       struct channel_gk20a_timeout timeout;
+
        bool cmds_pending;
        struct {
                /* These fences should be accessed with submit_lock held. */
index fffc0c64e3f58c41998d930b3fa7f628c118c100..5dbbe9f8a92f62ea9391b29a20e6e7f1cb9b533c 100644 (file)
@@ -846,7 +846,7 @@ static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
        return verbose;
 }
 
-static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
+bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
                struct channel_gk20a *ch)
 {
        gk20a_err(dev_from_gk20a(g),
@@ -855,7 +855,7 @@ static bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
        return gk20a_fifo_set_ctx_mmu_error(g, ch);
 }
 
-static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
+bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
                struct tsg_gk20a *tsg)
 {
        bool ret = true;
@@ -877,7 +877,7 @@ static bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
        return ret;
 }
 
-static void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid)
+void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid)
 {
        struct tsg_gk20a *tsg = &g->fifo.tsg[tsgid];
        struct channel_gk20a *ch;
index be2382d65804f50a6d4925ce8d2b21e6980e0c6b..d2002f08e7145b70779c6e4a94e74d283c7b66a1 100644 (file)
@@ -183,5 +183,10 @@ u32 gk20a_fifo_engine_interrupt_mask(struct gk20a *g);
 u32 gk20a_fifo_get_pbdma_signature(struct gk20a *g);
 u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
                int *__id, bool *__is_tsg);
+bool gk20a_fifo_set_ctx_mmu_error_tsg(struct gk20a *g,
+               struct tsg_gk20a *tsg);
+void gk20a_fifo_abort_tsg(struct gk20a *g, u32 tsgid);
+bool gk20a_fifo_set_ctx_mmu_error_ch(struct gk20a *g,
+               struct channel_gk20a *ch);
 
 #endif /*__GR_GK20A_H__*/
index 036f253b532147d3d07718d5e279d4f5585d868d..257361241db479689afafa2b98338d41b2d80fdc 100644 (file)
@@ -650,6 +650,7 @@ static int gk20a_init_support(struct platform_device *dev)
 
        mutex_init(&g->dbg_sessions_lock);
        mutex_init(&g->client_lock);
+       mutex_init(&g->ch_wdt_lock);
 
        g->remove_support = gk20a_remove_support;
        return 0;
@@ -1453,6 +1454,7 @@ static int gk20a_probe(struct platform_device *dev)
                        CONFIG_GK20A_DEFAULT_TIMEOUT;
        if (tegra_platform_is_silicon())
                gk20a->timeouts_enabled = true;
+       gk20a->ch_wdt_enabled = true;
 
        /* Set up initial power settings. For non-slicon platforms, disable *
         * power features and for silicon platforms, read from platform data */
index f9874935553320be7e3fc017ee43a0e2c3f81461..709e0af2bee9bdf707e079e6b70cf368bb428b35 100644 (file)
@@ -443,6 +443,9 @@ struct gk20a {
        u32 gr_idle_timeout_default;
        u32 timeouts_enabled;
 
+       u32 ch_wdt_enabled;
+       struct mutex ch_wdt_lock;
+
        bool slcg_enabled;
        bool blcg_enabled;
        bool elcg_enabled;