rtime.felk.cvut.cz Git - linux-imx.git/commitdiff
drm/i915: detect hang using per ring hangcheck_score
author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Thu, 30 May 2013 06:04:29 +0000 (09:04 +0300)
committer: Daniel Vetter <daniel.vetter@ffwll.ch>
Mon, 3 Jun 2013 08:58:21 +0000 (10:58 +0200)
Keep track of per-ring seqno progress and, if no progress is
detected, declare a hang. Use the actual head (acthd) to distinguish
between a stuck ring and a looping batchbuffer. A stuck ring will be
kicked to trigger progress.

This commit adds a hard limit for batchbuffer completion time.
If batchbuffer completion time is more than 4.5 seconds,
the gpu will be declared hung.

Review comment from Ben which nicely clarifies the semantic change:

"Maybe I'm just stating the functional changes of the patch, but in case
they were unintended here is what I see as potential issues:

1. If ring B is waiting on ring A via semaphore, and ring A is making
   progress, albeit slowly - the hangcheck will fire. The check will
   determine that A is moving, however ring B will appear hung because
   the ACTHD doesn't move. I honestly can't say if that's actually a
   realistic problem to hit; it probably implies the timeout value is too
   low.

2. There's also another corner case on the kick. If the seqno = 2
   (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
   we try to kick it... we don't actually try to find out if the kick
   helped."

v2: use acthd to detect stuck ring from loop (Ben Widawsky)

v3: Use acthd to check when ring needs kicking.
Declare hang on third time in order to give time for
kick_ring to take effect.

v4: Update commit msg

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
[danvet: Paste in Ben's review comment.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
drivers/gpu/drm/i915/i915_irq.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index 5ae5ca8854d4ee864e367263b8ad08e2974dbee2..e88f173d6b339d08823ccf5f00dd5c8f5761f777 100644 (file)
@@ -683,7 +683,6 @@ static void notify_ring(struct drm_device *dev,
 
        wake_up_all(&ring->irq_queue);
        if (i915_enable_hangcheck) {
-               dev_priv->gpu_error.hangcheck_count = 0;
                mod_timer(&dev_priv->gpu_error.hangcheck_timer,
                          round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
        }
@@ -2422,61 +2421,76 @@ static bool i915_hangcheck_hung(struct drm_device *dev)
 
 /**
  * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. The first time this is called we simply record
- * ACTHD. If ACTHD hasn't changed by the time the hangcheck timer elapses
- * again, we assume the chip is wedged and try to fix it.
+ * batchbuffers in a long time. We keep track per ring seqno progress and
+ * if there are no progress, hangcheck score for that ring is increased.
+ * Further, acthd is inspected to see if the ring is stuck. On stuck case
+ * we kick the ring. If we see no progress on three subsequent calls
+ * we assume chip is wedged and try to fix it by resetting the chip.
  */
 void i915_hangcheck_elapsed(unsigned long data)
 {
        struct drm_device *dev = (struct drm_device *)data;
        drm_i915_private_t *dev_priv = dev->dev_private;
        struct intel_ring_buffer *ring;
-       bool err = false, idle;
        int i;
-       u32 seqno[I915_NUM_RINGS];
-       bool work_done;
+       int busy_count = 0, rings_hung = 0;
+       bool stuck[I915_NUM_RINGS];
 
        if (!i915_enable_hangcheck)
                return;
 
-       idle = true;
        for_each_ring(ring, dev_priv, i) {
-               seqno[i] = ring->get_seqno(ring, false);
-               idle &= i915_hangcheck_ring_idle(ring, seqno[i], &err);
-       }
+               u32 seqno, acthd;
+               bool idle, err = false;
+
+               seqno = ring->get_seqno(ring, false);
+               acthd = intel_ring_get_active_head(ring);
+               idle = i915_hangcheck_ring_idle(ring, seqno, &err);
+               stuck[i] = ring->hangcheck.acthd == acthd;
+
+               if (idle) {
+                       if (err)
+                               ring->hangcheck.score += 2;
+                       else
+                               ring->hangcheck.score = 0;
+               } else {
+                       busy_count++;
 
-       /* If all work is done then ACTHD clearly hasn't advanced. */
-       if (idle) {
-               if (err) {
-                       if (i915_hangcheck_hung(dev))
-                               return;
+                       if (ring->hangcheck.seqno == seqno) {
+                               ring->hangcheck.score++;
 
-                       goto repeat;
+                               /* Kick ring if stuck*/
+                               if (stuck[i])
+                                       i915_hangcheck_ring_hung(ring);
+                       } else {
+                               ring->hangcheck.score = 0;
+                       }
                }
 
-               dev_priv->gpu_error.hangcheck_count = 0;
-               return;
+               ring->hangcheck.seqno = seqno;
+               ring->hangcheck.acthd = acthd;
        }
 
-       work_done = false;
        for_each_ring(ring, dev_priv, i) {
-               if (ring->hangcheck.seqno != seqno[i]) {
-                       work_done = true;
-                       ring->hangcheck.seqno = seqno[i];
+               if (ring->hangcheck.score > 2) {
+                       rings_hung++;
+                       DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
+                                 stuck[i] ? "stuck" : "no progress",
+                                 stuck[i] ? "addr" : "seqno",
+                                 stuck[i] ? ring->hangcheck.acthd & HEAD_ADDR :
+                                 ring->hangcheck.seqno);
                }
        }
 
-       if (!work_done) {
-               if (i915_hangcheck_hung(dev))
-                       return;
-       } else {
-               dev_priv->gpu_error.hangcheck_count = 0;
-       }
+       if (rings_hung)
+               return i915_handle_error(dev, true);
 
-repeat:
-       /* Reset timer case chip hangs without another request being added */
-       mod_timer(&dev_priv->gpu_error.hangcheck_timer,
-                 round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+       if (busy_count)
+               /* Reset timer case chip hangs without another request
+                * being added */
+               mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+                         round_jiffies_up(jiffies +
+                                          DRM_I915_HANGCHECK_JIFFIES));
 }
 
 /* drm_dma.h hooks
index 022d07e43d129871c13dc56d146509d9797aad8d..4c7e103e6fa4c779d2554c2f983cd8075b139865 100644 (file)
@@ -39,6 +39,8 @@ struct  intel_hw_status_page {
 
 struct intel_ring_hangcheck {
        u32 seqno;
+       u32 acthd;
+       int score;
 };
 
 struct  intel_ring_buffer {