drm/i915: Record batch buffer following GPU error
authorChris Wilson <chris@chris-wilson.co.uk>
Thu, 18 Feb 2010 10:24:56 +0000 (10:24 +0000)
committerEric Anholt <eric@anholt.net>
Mon, 22 Feb 2010 17:01:39 +0000 (12:01 -0500)
In order to improve our diagnostic capabilities following a GPU hang
and subsequent reset, we need to record the batch buffer that triggered
the error. We assume that the current batch buffer, plus a few details
about what else is on the active list, will be sufficient -- at the very
least an improvement over nothing.

The extra information is stored in /debug/dri/.../i915_error_state
following an error, and may be decoded using
intel_gpu_tools/tools/intel_error_decode.

v2: Avoid excessive work under spinlocks.
v3: Include ringbuffer for later analysis.
v4: Use kunmap correctly and record more buffer state.
v5: Search ringbuffer for current batch buffer
v6: Use a work fn for the impossible IRQ error case.
v7: Avoid non-atomic paths whilst in IRQ context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Eric Anholt <eric@anholt.net>
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_dma.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_irq.c
drivers/gpu/drm/i915/i915_reg.h

index 56095b3..5eed463 100644 (file)
@@ -350,6 +350,36 @@ static int i915_ringbuffer_info(struct seq_file *m, void *data)
        return 0;
 }
 
+static const char *pin_flag(int pinned)
+{
+       if (pinned > 0)
+               return " P";
+       else if (pinned < 0)
+               return " p";
+       else
+               return "";
+}
+
+static const char *tiling_flag(int tiling)
+{
+       switch (tiling) {
+       default:
+       case I915_TILING_NONE: return "";
+       case I915_TILING_X: return " X";
+       case I915_TILING_Y: return " Y";
+       }
+}
+
+static const char *dirty_flag(int dirty)
+{
+       return dirty ? " dirty" : "";
+}
+
+static const char *purgeable_flag(int purgeable)
+{
+       return purgeable ? " purgeable" : "";
+}
+
 static int i915_error_state(struct seq_file *m, void *unused)
 {
        struct drm_info_node *node = (struct drm_info_node *) m->private;
@@ -357,6 +387,7 @@ static int i915_error_state(struct seq_file *m, void *unused)
        drm_i915_private_t *dev_priv = dev->dev_private;
        struct drm_i915_error_state *error;
        unsigned long flags;
+       int i, page, offset, elt;
 
        spin_lock_irqsave(&dev_priv->error_lock, flags);
        if (!dev_priv->first_error) {
@@ -368,6 +399,7 @@ static int i915_error_state(struct seq_file *m, void *unused)
 
        seq_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
                   error->time.tv_usec);
+       seq_printf(m, "PCI ID: 0x%04x\n", dev->pci_device);
        seq_printf(m, "EIR: 0x%08x\n", error->eir);
        seq_printf(m, "  PGTBL_ER: 0x%08x\n", error->pgtbl_er);
        seq_printf(m, "  INSTPM: 0x%08x\n", error->instpm);
@@ -379,6 +411,59 @@ static int i915_error_state(struct seq_file *m, void *unused)
                seq_printf(m, "  INSTPS: 0x%08x\n", error->instps);
                seq_printf(m, "  INSTDONE1: 0x%08x\n", error->instdone1);
        }
+       seq_printf(m, "seqno: 0x%08x\n", error->seqno);
+
+       if (error->active_bo_count) {
+               seq_printf(m, "Buffers [%d]:\n", error->active_bo_count);
+
+               for (i = 0; i < error->active_bo_count; i++) {
+                       seq_printf(m, "  %08x %8zd %08x %08x %08x%s%s%s%s",
+                                  error->active_bo[i].gtt_offset,
+                                  error->active_bo[i].size,
+                                  error->active_bo[i].read_domains,
+                                  error->active_bo[i].write_domain,
+                                  error->active_bo[i].seqno,
+                                  pin_flag(error->active_bo[i].pinned),
+                                  tiling_flag(error->active_bo[i].tiling),
+                                  dirty_flag(error->active_bo[i].dirty),
+                                  purgeable_flag(error->active_bo[i].purgeable));
+
+                       if (error->active_bo[i].name)
+                               seq_printf(m, " (name: %d)", error->active_bo[i].name);
+                       if (error->active_bo[i].fence_reg != I915_FENCE_REG_NONE)
+                               seq_printf(m, " (fence: %d)", error->active_bo[i].fence_reg);
+
+                       seq_printf(m, "\n");
+               }
+       }
+
+       for (i = 0; i < ARRAY_SIZE(error->batchbuffer); i++) {
+               if (error->batchbuffer[i]) {
+                       struct drm_i915_error_object *obj = error->batchbuffer[i];
+
+                       seq_printf(m, "--- gtt_offset = 0x%08x\n", obj->gtt_offset);
+                       offset = 0;
+                       for (page = 0; page < obj->page_count; page++) {
+                               for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+                                       seq_printf(m, "%08x :  %08x\n", offset, obj->pages[page][elt]);
+                                       offset += 4;
+                               }
+                       }
+               }
+       }
+
+       if (error->ringbuffer) {
+               struct drm_i915_error_object *obj = error->ringbuffer;
+
+               seq_printf(m, "--- ringbuffer = 0x%08x\n", obj->gtt_offset);
+               offset = 0;
+               for (page = 0; page < obj->page_count; page++) {
+                       for (elt = 0; elt < PAGE_SIZE/4; elt++) {
+                               seq_printf(m, "%08x :  %08x\n", offset, obj->pages[page][elt]);
+                               offset += 4;
+                       }
+               }
+       }
 
 out:
        spin_unlock_irqrestore(&dev_priv->error_lock, flags);
index 47805a4..dbfe07c 100644 (file)
@@ -1644,6 +1644,8 @@ int i915_driver_unload(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
 
+       i915_destroy_error_state(dev);
+
        destroy_workqueue(dev_priv->wq);
        del_timer_sync(&dev_priv->hangcheck_timer);
 
index 40b0da3..ec06d48 100644 (file)
@@ -150,7 +150,27 @@ struct drm_i915_error_state {
        u32 instps;
        u32 instdone1;
        u32 seqno;
+       u64 bbaddr;
        struct timeval time;
+       struct drm_i915_error_object {
+               int page_count;
+               u32 gtt_offset;
+               u32 *pages[0];
+       } *ringbuffer, *batchbuffer[2];
+       struct drm_i915_error_buffer {
+               size_t size;
+               u32 name;
+               u32 seqno;
+               u32 gtt_offset;
+               u32 read_domains;
+               u32 write_domain;
+               u32 fence_reg;
+               s32 pinned:2;
+               u32 tiling:2;
+               u32 dirty:1;
+               u32 purgeable:1;
+       } *active_bo;
+       u32 active_bo_count;
 };
 
 struct drm_i915_display_funcs {
@@ -778,6 +798,7 @@ extern int i965_reset(struct drm_device *dev, u8 flags);
 
 /* i915_irq.c */
 void i915_hangcheck_elapsed(unsigned long data);
+void i915_destroy_error_state(struct drm_device *dev);
 extern int i915_irq_emit(struct drm_device *dev, void *data,
                         struct drm_file *file_priv);
 extern int i915_irq_wait(struct drm_device *dev, void *data,
index 1a56ae7..ba1d831 100644 (file)
@@ -432,6 +432,121 @@ static void i915_error_work_func(struct work_struct *work)
        }
 }
 
+static struct drm_i915_error_object *
+i915_error_object_create(struct drm_device *dev,
+                        struct drm_gem_object *src)
+{
+       struct drm_i915_error_object *dst;
+       struct drm_i915_gem_object *src_priv;
+       int page, page_count;
+
+       if (src == NULL)
+               return NULL;
+
+       src_priv = src->driver_private;
+       if (src_priv->pages == NULL)
+               return NULL;
+
+       page_count = src->size / PAGE_SIZE;
+
+       dst = kmalloc(sizeof(*dst) + page_count * sizeof (u32 *), GFP_ATOMIC);
+       if (dst == NULL)
+               return NULL;
+
+       for (page = 0; page < page_count; page++) {
+               void *s, *d = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+               if (d == NULL)
+                       goto unwind;
+               s = kmap_atomic(src_priv->pages[page], KM_USER0);
+               memcpy(d, s, PAGE_SIZE);
+               kunmap_atomic(s, KM_USER0);
+               dst->pages[page] = d;
+       }
+       dst->page_count = page_count;
+       dst->gtt_offset = src_priv->gtt_offset;
+
+       return dst;
+
+unwind:
+       while (page--)
+               kfree(dst->pages[page]);
+       kfree(dst);
+       return NULL;
+}
+
+static void
+i915_error_object_free(struct drm_i915_error_object *obj)
+{
+       int page;
+
+       if (obj == NULL)
+               return;
+
+       for (page = 0; page < obj->page_count; page++)
+               kfree(obj->pages[page]);
+
+       kfree(obj);
+}
+
+static void
+i915_error_state_free(struct drm_device *dev,
+                     struct drm_i915_error_state *error)
+{
+       i915_error_object_free(error->batchbuffer[0]);
+       i915_error_object_free(error->batchbuffer[1]);
+       i915_error_object_free(error->ringbuffer);
+       kfree(error->active_bo);
+       kfree(error);
+}
+
+static u32
+i915_get_bbaddr(struct drm_device *dev, u32 *ring)
+{
+       u32 cmd;
+
+       if (IS_I830(dev) || IS_845G(dev))
+               cmd = MI_BATCH_BUFFER;
+       else if (IS_I965G(dev))
+               cmd = (MI_BATCH_BUFFER_START | (2 << 6) |
+                      MI_BATCH_NON_SECURE_I965);
+       else
+               cmd = (MI_BATCH_BUFFER_START | (2 << 6));
+
+       return ring[0] == cmd ? ring[1] : 0;
+}
+
+static u32
+i915_ringbuffer_last_batch(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       u32 head, bbaddr;
+       u32 *ring;
+
+       /* Locate the current position in the ringbuffer and walk back
+        * to find the most recently dispatched batch buffer.
+        */
+       bbaddr = 0;
+       head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
+       ring = (u32 *)(dev_priv->ring.virtual_start + head);
+
+       while (--ring >= (u32 *)dev_priv->ring.virtual_start) {
+               bbaddr = i915_get_bbaddr(dev, ring);
+               if (bbaddr)
+                       break;
+       }
+
+       if (bbaddr == 0) {
+               ring = (u32 *)(dev_priv->ring.virtual_start + dev_priv->ring.Size);
+               while (--ring >= (u32 *)dev_priv->ring.virtual_start) {
+                       bbaddr = i915_get_bbaddr(dev, ring);
+                       if (bbaddr)
+                               break;
+               }
+       }
+
+       return bbaddr;
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -444,19 +559,26 @@ static void i915_error_work_func(struct work_struct *work)
 static void i915_capture_error_state(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
+       struct drm_i915_gem_object *obj_priv;
        struct drm_i915_error_state *error;
+       struct drm_gem_object *batchbuffer[2];
        unsigned long flags;
+       u32 bbaddr;
+       int count;
 
        spin_lock_irqsave(&dev_priv->error_lock, flags);
-       if (dev_priv->first_error)
-               goto out;
+       error = dev_priv->first_error;
+       spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+       if (error)
+               return;
 
        error = kmalloc(sizeof(*error), GFP_ATOMIC);
        if (!error) {
-               DRM_DEBUG_DRIVER("out ot memory, not capturing error state\n");
-               goto out;
+               DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
+               return;
        }
 
+       error->seqno = i915_get_gem_seqno(dev);
        error->eir = I915_READ(EIR);
        error->pgtbl_er = I915_READ(PGTBL_ER);
        error->pipeastat = I915_READ(PIPEASTAT);
@@ -467,6 +589,7 @@ static void i915_capture_error_state(struct drm_device *dev)
                error->ipehr = I915_READ(IPEHR);
                error->instdone = I915_READ(INSTDONE);
                error->acthd = I915_READ(ACTHD);
+               error->bbaddr = 0;
        } else {
                error->ipeir = I915_READ(IPEIR_I965);
                error->ipehr = I915_READ(IPEHR_I965);
@@ -474,14 +597,101 @@ static void i915_capture_error_state(struct drm_device *dev)
                error->instps = I915_READ(INSTPS);
                error->instdone1 = I915_READ(INSTDONE1);
                error->acthd = I915_READ(ACTHD_I965);
+               error->bbaddr = I915_READ64(BB_ADDR);
        }
 
-       do_gettimeofday(&error->time);
+       bbaddr = i915_ringbuffer_last_batch(dev);
+
+       /* Grab the current batchbuffer, most likely to have crashed. */
+       batchbuffer[0] = NULL;
+       batchbuffer[1] = NULL;
+       count = 0;
+       list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+               struct drm_gem_object *obj = obj_priv->obj;
+
+               if (batchbuffer[0] == NULL &&
+                   bbaddr >= obj_priv->gtt_offset &&
+                   bbaddr < obj_priv->gtt_offset + obj->size)
+                       batchbuffer[0] = obj;
+
+               if (batchbuffer[1] == NULL &&
+                   error->acthd >= obj_priv->gtt_offset &&
+                   error->acthd < obj_priv->gtt_offset + obj->size &&
+                   batchbuffer[0] != obj)
+                       batchbuffer[1] = obj;
+
+               count++;
+       }
 
-       dev_priv->first_error = error;
+       /* We need to copy these to an anonymous buffer as the simplest
+        * method to avoid being overwritten by userpace.
+        */
+       error->batchbuffer[0] = i915_error_object_create(dev, batchbuffer[0]);
+       error->batchbuffer[1] = i915_error_object_create(dev, batchbuffer[1]);
+
+       /* Record the ringbuffer */
+       error->ringbuffer = i915_error_object_create(dev, dev_priv->ring.ring_obj);
+
+       /* Record buffers on the active list. */
+       error->active_bo = NULL;
+       error->active_bo_count = 0;
+
+       if (count)
+               error->active_bo = kmalloc(sizeof(*error->active_bo)*count,
+                                          GFP_ATOMIC);
+
+       if (error->active_bo) {
+               int i = 0;
+               list_for_each_entry(obj_priv, &dev_priv->mm.active_list, list) {
+                       struct drm_gem_object *obj = obj_priv->obj;
+
+                       error->active_bo[i].size = obj->size;
+                       error->active_bo[i].name = obj->name;
+                       error->active_bo[i].seqno = obj_priv->last_rendering_seqno;
+                       error->active_bo[i].gtt_offset = obj_priv->gtt_offset;
+                       error->active_bo[i].read_domains = obj->read_domains;
+                       error->active_bo[i].write_domain = obj->write_domain;
+                       error->active_bo[i].fence_reg = obj_priv->fence_reg;
+                       error->active_bo[i].pinned = 0;
+                       if (obj_priv->pin_count > 0)
+                               error->active_bo[i].pinned = 1;
+                       if (obj_priv->user_pin_count > 0)
+                               error->active_bo[i].pinned = -1;
+                       error->active_bo[i].tiling = obj_priv->tiling_mode;
+                       error->active_bo[i].dirty = obj_priv->dirty;
+                       error->active_bo[i].purgeable = obj_priv->madv != I915_MADV_WILLNEED;
+
+                       if (++i == count)
+                               break;
+               }
+               error->active_bo_count = i;
+       }
+
+       do_gettimeofday(&error->time);
 
-out:
+       spin_lock_irqsave(&dev_priv->error_lock, flags);
+       if (dev_priv->first_error == NULL) {
+               dev_priv->first_error = error;
+               error = NULL;
+       }
        spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+
+       if (error)
+               i915_error_state_free(dev, error);
+}
+
+void i915_destroy_error_state(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct drm_i915_error_state *error;
+
+       spin_lock(&dev_priv->error_lock);
+       error = dev_priv->first_error;
+       dev_priv->first_error = NULL;
+       spin_unlock(&dev_priv->error_lock);
+
+       if (error)
+               i915_error_state_free(dev, error);
 }
 
 /**
index d344c03..eff8d85 100644 (file)
 #define   CM0_COLOR_EVICT_DISABLE (1<<3)
 #define   CM0_DEPTH_WRITE_DISABLE (1<<1)
 #define   CM0_RC_OP_FLUSH_DISABLE (1<<0)
+#define BB_ADDR                0x02140 /* 8 bytes */
 #define GFX_FLSH_CNTL  0x02170 /* 915+ only */