gpu: nvgpu: fix deadlock on railgate_lock during race condition

author Deepak Nibade <dnibade@nvidia.com>
Thu, 19 Mar 2015 13:29:11 +0000 (18:59 +0530)
committer Deepak Nibade <dnibade@nvidia.com>
Tue, 7 Apr 2015 08:49:32 +0000 (01:49 -0700)
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c

index 8fcbfa901ae36e8ac2df283fdf7a90ef88cd1ce3..c81087d85e7f12f1a2b09bde2653cd0a92262ff0 100644 (file)
--- a/drivers/gpu/nvgpu/gk20a/gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -1748,6 +1748,10 @@ void gk20a_reset(struct gk20a *g, u32 units)
   * __gk20a_do_idle() - force the GPU to idle and railgate
   *
   * In success, this call MUST be balanced by caller with __gk20a_do_unidle()
+ *
+ * Acquires two locks: &g->busy_lock and &platform->railgate_lock
+ * On success, we return with both locks still held
+ * On failure, we release both locks before returning
   */
  int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
  {
@@ -1768,15 +1772,15 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
         if (platform->is_railgated(pdev))
                 return 0;
  
-       /* check if global force_reset flag is set */
-       force_reset |= platform->force_reset_in_do_idle;
-
-       /* prevent suspend by incrementing usage counter */
-       pm_runtime_get_noresume(&pdev->dev);
+       /*
+        * release railgate_lock, prevent suspend by incrementing usage counter,
+        * re-acquire railgate_lock
+        */
+       mutex_unlock(&platform->railgate_lock);
+       pm_runtime_get_sync(&pdev->dev);
+       mutex_lock(&platform->railgate_lock);
  
         /* check and wait until GPU is idle (with a timeout) */
-       pm_runtime_barrier(&pdev->dev);
-
         do {
                 msleep(1);
                 ref_cnt = atomic_read(&pdev->dev.power.usage_count);
@@ -1785,19 +1789,21 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
         if (ref_cnt != 1) {
                 gk20a_err(&pdev->dev, "failed to idle - refcount %d != 1\n",
                         ref_cnt);
-               goto fail;
+               goto fail_drop_usage_count;
         }
  
-       /*
-        * if GPU is now idle, we will have only one ref count
-        * drop this ref which will rail gate the GPU (if GPU
-        * railgate is supported)
-        * if GPU railgate is not supported then we need to
-        * explicitly reset it
-        */
-       pm_runtime_put_sync(&pdev->dev);
+       /* check if global force_reset flag is set */
+       force_reset |= platform->force_reset_in_do_idle;
  
         if (platform->can_railgate && !force_reset) {
+               /*
+                * Case 1 : GPU railgate is supported
+                *
+                * if GPU is now idle, we will have only one ref count,
+                * drop this ref which will rail gate the GPU
+                */
+               pm_runtime_put_sync(&pdev->dev);
+
                 /* add sufficient delay to allow GPU to rail gate */
                 msleep(platform->railgate_delay);
  
@@ -1816,12 +1822,24 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
                         goto fail_timeout;
                 }
         } else {
+               /*
+                * Case 2 : GPU railgate is not supported or we explicitly
+                * do not want to do railgate
+                *
+                * if GPU is now idle, call prepare_poweroff() to save the
+                * state and then assert the reset
+                *
+                * __gk20a_do_unidle() needs to deassert reset, call
+                * finalize_poweron(), and then call pm_runtime_put_sync()
+                * to balance the GPU usage counter
+                */
                 if (!platform->reset_assert || !platform->reset_deassert)
-                       goto fail_timeout;
+                       goto fail_drop_usage_count;
  
-               pm_runtime_get_sync(&pdev->dev);
+               /* Save the GPU state */
                 gk20a_pm_prepare_poweroff(&pdev->dev);
  
+               /* assert GPU reset */
                 platform->reset_assert(pdev);
  
                 udelay(10);
@@ -1830,7 +1848,7 @@ int __gk20a_do_idle(struct platform_device *pdev, bool force_reset)
                 return 0;
         }
  
-fail:
+fail_drop_usage_count:
         pm_runtime_put_noidle(&pdev->dev);
  fail_timeout:
         mutex_unlock(&platform->railgate_lock);
@@ -1866,9 +1884,16 @@ int __gk20a_do_unidle(struct platform_device *pdev)
         struct gk20a_platform *platform = dev_get_drvdata(&pdev->dev);
  
         if (g->forced_reset) {
+               /*
+                * If we did a reset (and not railgate),
+                * then deassert the GPU reset here first
+                */
                 platform->reset_deassert(pdev);
  
+               /* restore the GPU state */
                 gk20a_pm_finalize_poweron(&pdev->dev);
+
+               /* balance GPU usage counter */
                 pm_runtime_put_sync(&pdev->dev);
  
                 g->forced_reset = false;
author	Deepak Nibade <dnibade@nvidia.com>
	Thu, 19 Mar 2015 13:29:11 +0000 (18:59 +0530)
committer	Deepak Nibade <dnibade@nvidia.com>
	Tue, 7 Apr 2015 08:49:32 +0000 (01:49 -0700)