diff options
Diffstat (limited to 'target/linux/brcm2708/patches-4.19/950-0609-drm-v3d-Add-support-for-compute-shader-dispatch.patch')
-rw-r--r-- | target/linux/brcm2708/patches-4.19/950-0609-drm-v3d-Add-support-for-compute-shader-dispatch.patch | 897 |
1 files changed, 0 insertions, 897 deletions
diff --git a/target/linux/brcm2708/patches-4.19/950-0609-drm-v3d-Add-support-for-compute-shader-dispatch.patch b/target/linux/brcm2708/patches-4.19/950-0609-drm-v3d-Add-support-for-compute-shader-dispatch.patch deleted file mode 100644 index dc230621dc..0000000000 --- a/target/linux/brcm2708/patches-4.19/950-0609-drm-v3d-Add-support-for-compute-shader-dispatch.patch +++ /dev/null @@ -1,897 +0,0 @@ -From 22dbf1420a552d1952d22b92d8c30f8162b026b5 Mon Sep 17 00:00:00 2001 -From: Eric Anholt <eric@anholt.net> -Date: Tue, 16 Apr 2019 15:58:54 -0700 -Subject: [PATCH 609/806] drm/v3d: Add support for compute shader dispatch. - -The compute shader dispatch interface is pretty simple -- just pass in -the regs that userspace has passed us, with no CLs to run. However, -with no CL to run it means that we need to do manual cache flushing of -the L2 after the HW execution completes (for SSBO, atomic, and -image_load_store writes that are the output of compute shaders). - -This doesn't yet expose the L2 cache's ability to have a region of the -address space not write back to memory (which could be used for -shared_var storage). - -So far, the Mesa side has been tested on V3D v4.2 simpenrose (passing -the ES31 tests), and on the kernel side on 7278 (failing atomic -compswap tests in a way that doesn't reproduce on simpenrose). - -v2: Fix excessive allocation for the clean_job (reported by Dan - Carpenter). Keep refs on jobs until clean_job is finished, to - avoid spurious MMU errors if the output BOs are freed by userspace - before L2 cleaning is finished. 
- -Signed-off-by: Eric Anholt <eric@anholt.net> -Link: https://patchwork.freedesktop.org/patch/msgid/20190416225856.20264-4-eric@anholt.net -Acked-by: Rob Clark <robdclark@gmail.com> ---- - drivers/gpu/drm/v3d/v3d_debugfs.c | 22 +++++ - drivers/gpu/drm/v3d/v3d_drv.c | 10 +- - drivers/gpu/drm/v3d/v3d_drv.h | 28 +++++- - drivers/gpu/drm/v3d/v3d_fence.c | 2 + - drivers/gpu/drm/v3d/v3d_gem.c | 156 +++++++++++++++++++++++++++++- - drivers/gpu/drm/v3d/v3d_irq.c | 16 ++- - drivers/gpu/drm/v3d/v3d_regs.h | 73 ++++++++++++++ - drivers/gpu/drm/v3d/v3d_sched.c | 121 +++++++++++++++++++++-- - drivers/gpu/drm/v3d/v3d_trace.h | 94 ++++++++++++++++++ - include/uapi/drm/v3d_drm.h | 28 ++++++ - 10 files changed, 531 insertions(+), 19 deletions(-) - ---- a/drivers/gpu/drm/v3d/v3d_debugfs.c -+++ b/drivers/gpu/drm/v3d/v3d_debugfs.c -@@ -57,6 +57,17 @@ static const struct v3d_reg_def v3d_core - REGDEF(V3D_GMP_VIO_ADDR), - }; - -+static const struct v3d_reg_def v3d_csd_reg_defs[] = { -+ REGDEF(V3D_CSD_STATUS), -+ REGDEF(V3D_CSD_CURRENT_CFG0), -+ REGDEF(V3D_CSD_CURRENT_CFG1), -+ REGDEF(V3D_CSD_CURRENT_CFG2), -+ REGDEF(V3D_CSD_CURRENT_CFG3), -+ REGDEF(V3D_CSD_CURRENT_CFG4), -+ REGDEF(V3D_CSD_CURRENT_CFG5), -+ REGDEF(V3D_CSD_CURRENT_CFG6), -+}; -+ - static int v3d_v3d_debugfs_regs(struct seq_file *m, void *unused) - { - struct drm_info_node *node = (struct drm_info_node *)m->private; -@@ -88,6 +99,17 @@ static int v3d_v3d_debugfs_regs(struct s - V3D_CORE_READ(core, - v3d_core_reg_defs[i].reg)); - } -+ -+ if (v3d_has_csd(v3d)) { -+ for (i = 0; i < ARRAY_SIZE(v3d_csd_reg_defs); i++) { -+ seq_printf(m, "core %d %s (0x%04x): 0x%08x\n", -+ core, -+ v3d_csd_reg_defs[i].name, -+ v3d_csd_reg_defs[i].reg, -+ V3D_CORE_READ(core, -+ v3d_csd_reg_defs[i].reg)); -+ } -+ } - } - - return 0; ---- a/drivers/gpu/drm/v3d/v3d_drv.c -+++ b/drivers/gpu/drm/v3d/v3d_drv.c -@@ -7,9 +7,9 @@ - * This driver supports the Broadcom V3D 3.3 and 4.1 OpenGL ES GPUs. - * For V3D 2.x support, see the VC4 driver. 
- * -- * Currently only single-core rendering using the binner and renderer, -- * along with TFU (texture formatting unit) rendering is supported. -- * V3D 4.x's CSD (compute shader dispatch) is not yet supported. -+ * The V3D GPU includes a tiled renderer (composed of bin and render -+ * pipelines), the TFU (texture formatting unit), and the CSD (compute -+ * shader dispatch). - */ - - #include <linux/clk.h> -@@ -114,6 +114,9 @@ static int v3d_get_param_ioctl(struct dr - case DRM_V3D_PARAM_SUPPORTS_TFU: - args->value = 1; - return 0; -+ case DRM_V3D_PARAM_SUPPORTS_CSD: -+ args->value = v3d_has_csd(v3d); -+ return 0; - default: - DRM_DEBUG("Unknown parameter %d\n", args->param); - return -EINVAL; -@@ -183,6 +186,7 @@ static const struct drm_ioctl_desc v3d_d - DRM_IOCTL_DEF_DRV(V3D_GET_PARAM, v3d_get_param_ioctl, DRM_RENDER_ALLOW), - DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW), - DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH), -+ DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH), - }; - - static const struct vm_operations_struct v3d_vm_ops = { ---- a/drivers/gpu/drm/v3d/v3d_drv.h -+++ b/drivers/gpu/drm/v3d/v3d_drv.h -@@ -16,9 +16,11 @@ enum v3d_queue { - V3D_BIN, - V3D_RENDER, - V3D_TFU, -+ V3D_CSD, -+ V3D_CACHE_CLEAN, - }; - --#define V3D_MAX_QUEUES (V3D_TFU + 1) -+#define V3D_MAX_QUEUES (V3D_CACHE_CLEAN + 1) - - struct v3d_queue_state { - struct drm_gpu_scheduler sched; -@@ -70,6 +72,7 @@ struct v3d_dev { - struct v3d_bin_job *bin_job; - struct v3d_render_job *render_job; - struct v3d_tfu_job *tfu_job; -+ struct v3d_csd_job *csd_job; - - struct v3d_queue_state queue[V3D_MAX_QUEUES]; - -@@ -92,6 +95,12 @@ struct v3d_dev { - */ - struct mutex sched_lock; - -+ /* Lock taken during a cache clean and when initiating an L2 -+ * flush, to keep L2 flushes from interfering with the -+ * synchronous L2 cleans. 
-+ */ -+ struct mutex cache_clean_lock; -+ - struct { - u32 num_allocated; - u32 pages_allocated; -@@ -104,6 +113,12 @@ to_v3d_dev(struct drm_device *dev) - return (struct v3d_dev *)dev->dev_private; - } - -+static inline bool -+v3d_has_csd(struct v3d_dev *v3d) -+{ -+ return v3d->ver >= 41; -+} -+ - /* The per-fd struct, which tracks the MMU mappings. */ - struct v3d_file_priv { - struct v3d_dev *v3d; -@@ -237,6 +252,14 @@ struct v3d_tfu_job { - struct drm_v3d_submit_tfu args; - }; - -+struct v3d_csd_job { -+ struct v3d_job base; -+ -+ u32 timedout_batches; -+ -+ struct drm_v3d_submit_csd args; -+}; -+ - /** - * _wait_for - magic (register) wait macro - * -@@ -302,11 +325,14 @@ int v3d_submit_cl_ioctl(struct drm_devic - struct drm_file *file_priv); - int v3d_submit_tfu_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv); -+int v3d_submit_csd_ioctl(struct drm_device *dev, void *data, -+ struct drm_file *file_priv); - int v3d_wait_bo_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv); - void v3d_job_put(struct v3d_job *job); - void v3d_reset(struct v3d_dev *v3d); - void v3d_invalidate_caches(struct v3d_dev *v3d); -+void v3d_clean_caches(struct v3d_dev *v3d); - - /* v3d_irq.c */ - int v3d_irq_init(struct v3d_dev *v3d); ---- a/drivers/gpu/drm/v3d/v3d_fence.c -+++ b/drivers/gpu/drm/v3d/v3d_fence.c -@@ -36,6 +36,8 @@ static const char *v3d_fence_get_timelin - return "v3d-render"; - case V3D_TFU: - return "v3d-tfu"; -+ case V3D_CSD: -+ return "v3d-csd"; - default: - return NULL; - } ---- a/drivers/gpu/drm/v3d/v3d_gem.c -+++ b/drivers/gpu/drm/v3d/v3d_gem.c -@@ -162,10 +162,52 @@ v3d_flush_l2t(struct v3d_dev *v3d, int c - /* While there is a busy bit (V3D_L2TCACTL_L2TFLS), we don't - * need to wait for completion before dispatching the job -- - * L2T accesses will be stalled until the flush has completed. 
-+ * However, we do need to make sure we don't try to trigger a -+ * new flush while the CACHE_CLEAN queue is trying to -+ * synchronously clean after a job. - */ -+ mutex_lock(&v3d->cache_clean_lock); - V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, - V3D_L2TCACTL_L2TFLS | - V3D_SET_FIELD(V3D_L2TCACTL_FLM_FLUSH, V3D_L2TCACTL_FLM)); -+ mutex_unlock(&v3d->cache_clean_lock); -+} -+ -+/* Cleans texture L1 and L2 cachelines (writing back dirty data). -+ * -+ * For cleaning, which happens from the CACHE_CLEAN queue after CSD has -+ * executed, we need to make sure that the clean is done before -+ * signaling job completion. So, we synchronously wait before -+ * returning, and we make sure that L2 invalidates don't happen in the -+ * meantime to confuse our are-we-done checks. -+ */ -+void -+v3d_clean_caches(struct v3d_dev *v3d) -+{ -+ struct drm_device *dev = &v3d->drm; -+ int core = 0; -+ -+ trace_v3d_cache_clean_begin(dev); -+ -+ V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, V3D_L2TCACTL_TMUWCF); -+ if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & -+ V3D_L2TCACTL_L2TFLS), 100)) { -+ DRM_ERROR("Timeout waiting for L1T write combiner flush\n"); -+ } -+ -+ mutex_lock(&v3d->cache_clean_lock); -+ V3D_CORE_WRITE(core, V3D_CTL_L2TCACTL, -+ V3D_L2TCACTL_L2TFLS | -+ V3D_SET_FIELD(V3D_L2TCACTL_FLM_CLEAN, V3D_L2TCACTL_FLM)); -+ -+ if (wait_for(!(V3D_CORE_READ(core, V3D_CTL_L2TCACTL) & -+ V3D_L2TCACTL_L2TFLS), 100)) { -+ DRM_ERROR("Timeout waiting for L2T clean\n"); -+ } -+ -+ mutex_unlock(&v3d->cache_clean_lock); -+ -+ trace_v3d_cache_clean_end(dev); - } - - /* Invalidates the slice caches. These are read-only caches. 
*/ -@@ -584,7 +626,8 @@ static void - v3d_attach_fences_and_unlock_reservation(struct drm_file *file_priv, - struct v3d_job *job, - struct ww_acquire_ctx *acquire_ctx, -- u32 out_sync) -+ u32 out_sync, -+ struct dma_fence *done_fence) - { - struct drm_syncobj *sync_out; - -@@ -594,7 +637,7 @@ v3d_attach_fences_and_unlock_reservation - /* Update the return sync object for the job */ - sync_out = drm_syncobj_find(file_priv, out_sync); - if (sync_out) { -- drm_syncobj_replace_fence(sync_out, job->done_fence); -+ drm_syncobj_replace_fence(sync_out, done_fence); - drm_syncobj_put(sync_out); - } - } -@@ -691,8 +734,10 @@ v3d_submit_cl_ioctl(struct drm_device *d - mutex_unlock(&v3d->sched_lock); - - v3d_attach_fences_and_unlock_reservation(file_priv, -- &render->base, &acquire_ctx, -- args->out_sync); -+ &render->base, -+ &acquire_ctx, -+ args->out_sync, -+ render->base.done_fence); - - if (bin) - v3d_job_put(&bin->base); -@@ -785,7 +830,8 @@ v3d_submit_tfu_ioctl(struct drm_device * - - v3d_attach_fences_and_unlock_reservation(file_priv, - &job->base, &acquire_ctx, -- args->out_sync); -+ args->out_sync, -+ job->base.done_fence); - - v3d_job_put(&job->base); - -@@ -801,6 +847,105 @@ fail: - return ret; - } - -+/** -+ * v3d_submit_csd_ioctl() - Submits a CSD (compute shader dispatch) job to the V3D. -+ * @dev: DRM device -+ * @data: ioctl argument -+ * @file_priv: DRM file for this fd -+ * -+ * Userspace provides the register setup for the CSD, which we don't -+ * need to validate since the CSD is behind the MMU. 
-+ */ -+int -+v3d_submit_csd_ioctl(struct drm_device *dev, void *data, -+ struct drm_file *file_priv) -+{ -+ struct v3d_dev *v3d = to_v3d_dev(dev); -+ struct v3d_file_priv *v3d_priv = file_priv->driver_priv; -+ struct drm_v3d_submit_csd *args = data; -+ struct v3d_csd_job *job; -+ struct v3d_job *clean_job; -+ struct ww_acquire_ctx acquire_ctx; -+ int ret; -+ -+ trace_v3d_submit_csd_ioctl(&v3d->drm, args->cfg[5], args->cfg[6]); -+ -+ if (!v3d_has_csd(v3d)) { -+ DRM_DEBUG("Attempting CSD submit on non-CSD hardware\n"); -+ return -EINVAL; -+ } -+ -+ job = kcalloc(1, sizeof(*job), GFP_KERNEL); -+ if (!job) -+ return -ENOMEM; -+ -+ ret = v3d_job_init(v3d, file_priv, &job->base, -+ v3d_job_free, args->in_sync); -+ if (ret) { -+ kfree(job); -+ return ret; -+ } -+ -+ clean_job = kcalloc(1, sizeof(*clean_job), GFP_KERNEL); -+ if (!clean_job) { -+ v3d_job_put(&job->base); -+ kfree(job); -+ return -ENOMEM; -+ } -+ -+ ret = v3d_job_init(v3d, file_priv, clean_job, v3d_job_free, 0); -+ if (ret) { -+ v3d_job_put(&job->base); -+ kfree(clean_job); -+ return ret; -+ } -+ -+ job->args = *args; -+ -+ ret = v3d_lookup_bos(dev, file_priv, clean_job, -+ args->bo_handles, args->bo_handle_count); -+ if (ret) -+ goto fail; -+ -+ ret = v3d_lock_bo_reservations(clean_job, &acquire_ctx); -+ if (ret) -+ goto fail; -+ -+ mutex_lock(&v3d->sched_lock); -+ ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD); -+ if (ret) -+ goto fail_unreserve; -+ -+ ret = v3d_add_dep(clean_job, dma_fence_get(job->base.done_fence)); -+ if (ret) -+ goto fail_unreserve; -+ ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN); -+ if (ret) -+ goto fail_unreserve; -+ mutex_unlock(&v3d->sched_lock); -+ -+ v3d_attach_fences_and_unlock_reservation(file_priv, -+ clean_job, -+ &acquire_ctx, -+ args->out_sync, -+ clean_job->done_fence); -+ -+ v3d_job_put(&job->base); -+ v3d_job_put(clean_job); -+ -+ return 0; -+ -+fail_unreserve: -+ mutex_unlock(&v3d->sched_lock); -+ v3d_unlock_bo_reservations(clean_job->bo, 
clean_job->bo_count, -+ &acquire_ctx); -+fail: -+ v3d_job_put(&job->base); -+ v3d_job_put(clean_job); -+ -+ return ret; -+} -+ - int - v3d_gem_init(struct drm_device *dev) - { -@@ -816,6 +961,7 @@ v3d_gem_init(struct drm_device *dev) - mutex_init(&v3d->bo_lock); - mutex_init(&v3d->reset_lock); - mutex_init(&v3d->sched_lock); -+ mutex_init(&v3d->cache_clean_lock); - - /* Note: We don't allocate address 0. Various bits of HW - * treat 0 as special, such as the occlusion query counters ---- a/drivers/gpu/drm/v3d/v3d_irq.c -+++ b/drivers/gpu/drm/v3d/v3d_irq.c -@@ -4,9 +4,9 @@ - /** - * DOC: Interrupt management for the V3D engine - * -- * When we take a bin, render, or TFU done interrupt, we need to -- * signal the fence for that job so that the scheduler can queue up -- * the next one and unblock any waiters. -+ * When we take a bin, render, TFU done, or CSD done interrupt, we -+ * need to signal the fence for that job so that the scheduler can -+ * queue up the next one and unblock any waiters. - * - * When we take the binner out of memory interrupt, we need to - * allocate some new memory and pass it to the binner so that the -@@ -20,6 +20,7 @@ - #define V3D_CORE_IRQS ((u32)(V3D_INT_OUTOMEM | \ - V3D_INT_FLDONE | \ - V3D_INT_FRDONE | \ -+ V3D_INT_CSDDONE | \ - V3D_INT_GMPV)) - - #define V3D_HUB_IRQS ((u32)(V3D_HUB_INT_MMU_WRV | \ -@@ -108,6 +109,15 @@ v3d_irq(int irq, void *arg) - dma_fence_signal(&fence->base); - status = IRQ_HANDLED; - } -+ -+ if (intsts & V3D_INT_CSDDONE) { -+ struct v3d_fence *fence = -+ to_v3d_fence(v3d->csd_job->base.irq_fence); -+ -+ trace_v3d_csd_irq(&v3d->drm, fence->seqno); -+ dma_fence_signal(&fence->base); -+ status = IRQ_HANDLED; -+ } - - /* We shouldn't be triggering these if we have GMP in - * always-allowed mode. 
---- a/drivers/gpu/drm/v3d/v3d_regs.h -+++ b/drivers/gpu/drm/v3d/v3d_regs.h -@@ -238,8 +238,11 @@ - #define V3D_CTL_L2TCACTL 0x00030 - # define V3D_L2TCACTL_TMUWCF BIT(8) - # define V3D_L2TCACTL_L2T_NO_WM BIT(4) -+/* Invalidates cache lines. */ - # define V3D_L2TCACTL_FLM_FLUSH 0 -+/* Removes cachelines without writing dirty lines back. */ - # define V3D_L2TCACTL_FLM_CLEAR 1 -+/* Writes out dirty cachelines and marks them clean, but doesn't invalidate. */ - # define V3D_L2TCACTL_FLM_CLEAN 2 - # define V3D_L2TCACTL_FLM_MASK V3D_MASK(2, 1) - # define V3D_L2TCACTL_FLM_SHIFT 1 -@@ -255,6 +258,8 @@ - #define V3D_CTL_INT_MSK_CLR 0x00064 - # define V3D_INT_QPU_MASK V3D_MASK(27, 16) - # define V3D_INT_QPU_SHIFT 16 -+# define V3D_INT_CSDDONE BIT(7) -+# define V3D_INT_PCTR BIT(6) - # define V3D_INT_GMPV BIT(5) - # define V3D_INT_TRFB BIT(4) - # define V3D_INT_SPILLUSE BIT(3) -@@ -374,4 +379,72 @@ - #define V3D_GMP_PRESERVE_LOAD 0x00818 - #define V3D_GMP_VALID_LINES 0x00820 - -+#define V3D_CSD_STATUS 0x00900 -+# define V3D_CSD_STATUS_NUM_COMPLETED_MASK V3D_MASK(11, 4) -+# define V3D_CSD_STATUS_NUM_COMPLETED_SHIFT 4 -+# define V3D_CSD_STATUS_NUM_ACTIVE_MASK V3D_MASK(3, 2) -+# define V3D_CSD_STATUS_NUM_ACTIVE_SHIFT 2 -+# define V3D_CSD_STATUS_HAVE_CURRENT_DISPATCH BIT(1) -+# define V3D_CSD_STATUS_HAVE_QUEUED_DISPATCH BIT(0) -+ -+#define V3D_CSD_QUEUED_CFG0 0x00904 -+# define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_MASK V3D_MASK(31, 16) -+# define V3D_CSD_QUEUED_CFG0_NUM_WGS_X_SHIFT 16 -+# define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_MASK V3D_MASK(15, 0) -+# define V3D_CSD_QUEUED_CFG0_WG_X_OFFSET_SHIFT 0 -+ -+#define V3D_CSD_QUEUED_CFG1 0x00908 -+# define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_MASK V3D_MASK(31, 16) -+# define V3D_CSD_QUEUED_CFG1_NUM_WGS_Y_SHIFT 16 -+# define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_MASK V3D_MASK(15, 0) -+# define V3D_CSD_QUEUED_CFG1_WG_Y_OFFSET_SHIFT 0 -+ -+#define V3D_CSD_QUEUED_CFG2 0x0090c -+# define V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_MASK V3D_MASK(31, 16) -+# define 
V3D_CSD_QUEUED_CFG2_NUM_WGS_Z_SHIFT 16 -+# define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_MASK V3D_MASK(15, 0) -+# define V3D_CSD_QUEUED_CFG2_WG_Z_OFFSET_SHIFT 0 -+ -+#define V3D_CSD_QUEUED_CFG3 0x00910 -+# define V3D_CSD_QUEUED_CFG3_OVERLAP_WITH_PREV BIT(26) -+# define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_MASK V3D_MASK(25, 20) -+# define V3D_CSD_QUEUED_CFG3_MAX_SG_ID_SHIFT 20 -+# define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_MASK V3D_MASK(19, 12) -+# define V3D_CSD_QUEUED_CFG3_BATCHES_PER_SG_M1_SHIFT 12 -+# define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_MASK V3D_MASK(11, 8) -+# define V3D_CSD_QUEUED_CFG3_WGS_PER_SG_SHIFT 8 -+# define V3D_CSD_QUEUED_CFG3_WG_SIZE_MASK V3D_MASK(7, 0) -+# define V3D_CSD_QUEUED_CFG3_WG_SIZE_SHIFT 0 -+ -+/* Number of batches, minus 1 */ -+#define V3D_CSD_QUEUED_CFG4 0x00914 -+ -+/* Shader address, pnan, singleseg, threading, like a shader record. */ -+#define V3D_CSD_QUEUED_CFG5 0x00918 -+ -+/* Uniforms address (4 byte aligned) */ -+#define V3D_CSD_QUEUED_CFG6 0x0091c -+ -+#define V3D_CSD_CURRENT_CFG0 0x00920 -+#define V3D_CSD_CURRENT_CFG1 0x00924 -+#define V3D_CSD_CURRENT_CFG2 0x00928 -+#define V3D_CSD_CURRENT_CFG3 0x0092c -+#define V3D_CSD_CURRENT_CFG4 0x00930 -+#define V3D_CSD_CURRENT_CFG5 0x00934 -+#define V3D_CSD_CURRENT_CFG6 0x00938 -+ -+#define V3D_CSD_CURRENT_ID0 0x0093c -+# define V3D_CSD_CURRENT_ID0_WG_X_MASK V3D_MASK(31, 16) -+# define V3D_CSD_CURRENT_ID0_WG_X_SHIFT 16 -+# define V3D_CSD_CURRENT_ID0_WG_IN_SG_MASK V3D_MASK(11, 8) -+# define V3D_CSD_CURRENT_ID0_WG_IN_SG_SHIFT 8 -+# define V3D_CSD_CURRENT_ID0_L_IDX_MASK V3D_MASK(7, 0) -+# define V3D_CSD_CURRENT_ID0_L_IDX_SHIFT 0 -+ -+#define V3D_CSD_CURRENT_ID1 0x00940 -+# define V3D_CSD_CURRENT_ID0_WG_Z_MASK V3D_MASK(31, 16) -+# define V3D_CSD_CURRENT_ID0_WG_Z_SHIFT 16 -+# define V3D_CSD_CURRENT_ID0_WG_Y_MASK V3D_MASK(15, 0) -+# define V3D_CSD_CURRENT_ID0_WG_Y_SHIFT 0 -+ - #endif /* V3D_REGS_H */ ---- a/drivers/gpu/drm/v3d/v3d_sched.c -+++ b/drivers/gpu/drm/v3d/v3d_sched.c -@@ -48,6 +48,12 @@ 
to_tfu_job(struct drm_sched_job *sched_j - return container_of(sched_job, struct v3d_tfu_job, base.base); - } - -+static struct v3d_csd_job * -+to_csd_job(struct drm_sched_job *sched_job) -+{ -+ return container_of(sched_job, struct v3d_csd_job, base.base); -+} -+ - static void - v3d_job_free(struct drm_sched_job *sched_job) - { -@@ -205,6 +211,48 @@ v3d_tfu_job_run(struct drm_sched_job *sc - return fence; - } - -+static struct dma_fence * -+v3d_csd_job_run(struct drm_sched_job *sched_job) -+{ -+ struct v3d_csd_job *job = to_csd_job(sched_job); -+ struct v3d_dev *v3d = job->base.v3d; -+ struct drm_device *dev = &v3d->drm; -+ struct dma_fence *fence; -+ int i; -+ -+ v3d->csd_job = job; -+ -+ v3d_invalidate_caches(v3d); -+ -+ fence = v3d_fence_create(v3d, V3D_CSD); -+ if (IS_ERR(fence)) -+ return NULL; -+ -+ if (job->base.irq_fence) -+ dma_fence_put(job->base.irq_fence); -+ job->base.irq_fence = dma_fence_get(fence); -+ -+ trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno); -+ -+ for (i = 1; i <= 6; i++) -+ V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]); -+ /* CFG0 write kicks off the job. 
*/ -+ V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]); -+ -+ return fence; -+} -+ -+static struct dma_fence * -+v3d_cache_clean_job_run(struct drm_sched_job *sched_job) -+{ -+ struct v3d_job *job = to_v3d_job(sched_job); -+ struct v3d_dev *v3d = job->v3d; -+ -+ v3d_clean_caches(v3d); -+ -+ return NULL; -+} -+ - static void - v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) - { -@@ -277,13 +325,31 @@ v3d_render_job_timedout(struct drm_sched - } - - static void --v3d_tfu_job_timedout(struct drm_sched_job *sched_job) -+v3d_generic_job_timedout(struct drm_sched_job *sched_job) - { - struct v3d_job *job = to_v3d_job(sched_job); - - v3d_gpu_reset_for_timeout(job->v3d, sched_job); - } - -+static void -+v3d_csd_job_timedout(struct drm_sched_job *sched_job) -+{ -+ struct v3d_csd_job *job = to_csd_job(sched_job); -+ struct v3d_dev *v3d = job->base.v3d; -+ u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4); -+ -+ /* If we've made progress, skip reset and let the timer get -+ * rearmed. 
-+ */ -+ if (job->timedout_batches != batches) { -+ job->timedout_batches = batches; -+ return; -+ } -+ -+ v3d_gpu_reset_for_timeout(v3d, sched_job); -+} -+ - static const struct drm_sched_backend_ops v3d_bin_sched_ops = { - .dependency = v3d_job_dependency, - .run_job = v3d_bin_job_run, -@@ -301,10 +367,24 @@ static const struct drm_sched_backend_op - static const struct drm_sched_backend_ops v3d_tfu_sched_ops = { - .dependency = v3d_job_dependency, - .run_job = v3d_tfu_job_run, -- .timedout_job = v3d_tfu_job_timedout, -+ .timedout_job = v3d_generic_job_timedout, - .free_job = v3d_job_free, - }; - -+static const struct drm_sched_backend_ops v3d_csd_sched_ops = { -+ .dependency = v3d_job_dependency, -+ .run_job = v3d_csd_job_run, -+ .timedout_job = v3d_csd_job_timedout, -+ .free_job = v3d_job_free -+}; -+ -+static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = { -+ .dependency = v3d_job_dependency, -+ .run_job = v3d_cache_clean_job_run, -+ .timedout_job = v3d_generic_job_timedout, -+ .free_job = v3d_job_free -+}; -+ - int - v3d_sched_init(struct v3d_dev *v3d) - { -@@ -331,7 +411,7 @@ v3d_sched_init(struct v3d_dev *v3d) - if (ret) { - dev_err(v3d->dev, "Failed to create render scheduler: %d.", - ret); -- drm_sched_fini(&v3d->queue[V3D_BIN].sched); -+ v3d_sched_fini(v3d); - return ret; - } - -@@ -343,11 +423,36 @@ v3d_sched_init(struct v3d_dev *v3d) - if (ret) { - dev_err(v3d->dev, "Failed to create TFU scheduler: %d.", - ret); -- drm_sched_fini(&v3d->queue[V3D_RENDER].sched); -- drm_sched_fini(&v3d->queue[V3D_BIN].sched); -+ v3d_sched_fini(v3d); - return ret; - } - -+ if (v3d_has_csd(v3d)) { -+ ret = drm_sched_init(&v3d->queue[V3D_CSD].sched, -+ &v3d_csd_sched_ops, -+ hw_jobs_limit, job_hang_limit, -+ msecs_to_jiffies(hang_limit_ms), -+ "v3d_csd"); -+ if (ret) { -+ dev_err(v3d->dev, "Failed to create CSD scheduler: %d.", -+ ret); -+ v3d_sched_fini(v3d); -+ return ret; -+ } -+ -+ ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched, -+ 
&v3d_cache_clean_sched_ops, -+ hw_jobs_limit, job_hang_limit, -+ msecs_to_jiffies(hang_limit_ms), -+ "v3d_cache_clean"); -+ if (ret) { -+ dev_err(v3d->dev, "Failed to create CACHE_CLEAN scheduler: %d.", -+ ret); -+ v3d_sched_fini(v3d); -+ return ret; -+ } -+ } -+ - return 0; - } - -@@ -356,6 +461,8 @@ v3d_sched_fini(struct v3d_dev *v3d) - { - enum v3d_queue q; - -- for (q = 0; q < V3D_MAX_QUEUES; q++) -- drm_sched_fini(&v3d->queue[q].sched); -+ for (q = 0; q < V3D_MAX_QUEUES; q++) { -+ if (v3d->queue[q].sched.ops) -+ drm_sched_fini(&v3d->queue[q].sched); -+ } - } ---- a/drivers/gpu/drm/v3d/v3d_trace.h -+++ b/drivers/gpu/drm/v3d/v3d_trace.h -@@ -124,6 +124,26 @@ TRACE_EVENT(v3d_tfu_irq, - __entry->seqno) - ); - -+TRACE_EVENT(v3d_csd_irq, -+ TP_PROTO(struct drm_device *dev, -+ uint64_t seqno), -+ TP_ARGS(dev, seqno), -+ -+ TP_STRUCT__entry( -+ __field(u32, dev) -+ __field(u64, seqno) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = dev->primary->index; -+ __entry->seqno = seqno; -+ ), -+ -+ TP_printk("dev=%u, seqno=%llu", -+ __entry->dev, -+ __entry->seqno) -+); -+ - TRACE_EVENT(v3d_submit_tfu_ioctl, - TP_PROTO(struct drm_device *dev, u32 iia), - TP_ARGS(dev, iia), -@@ -163,6 +183,80 @@ TRACE_EVENT(v3d_submit_tfu, - __entry->seqno) - ); - -+TRACE_EVENT(v3d_submit_csd_ioctl, -+ TP_PROTO(struct drm_device *dev, u32 cfg5, u32 cfg6), -+ TP_ARGS(dev, cfg5, cfg6), -+ -+ TP_STRUCT__entry( -+ __field(u32, dev) -+ __field(u32, cfg5) -+ __field(u32, cfg6) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = dev->primary->index; -+ __entry->cfg5 = cfg5; -+ __entry->cfg6 = cfg6; -+ ), -+ -+ TP_printk("dev=%u, CFG5 0x%08x, CFG6 0x%08x", -+ __entry->dev, -+ __entry->cfg5, -+ __entry->cfg6) -+); -+ -+TRACE_EVENT(v3d_submit_csd, -+ TP_PROTO(struct drm_device *dev, -+ uint64_t seqno), -+ TP_ARGS(dev, seqno), -+ -+ TP_STRUCT__entry( -+ __field(u32, dev) -+ __field(u64, seqno) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = dev->primary->index; -+ __entry->seqno = seqno; -+ ), -+ -+ 
TP_printk("dev=%u, seqno=%llu", -+ __entry->dev, -+ __entry->seqno) -+); -+ -+TRACE_EVENT(v3d_cache_clean_begin, -+ TP_PROTO(struct drm_device *dev), -+ TP_ARGS(dev), -+ -+ TP_STRUCT__entry( -+ __field(u32, dev) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = dev->primary->index; -+ ), -+ -+ TP_printk("dev=%u", -+ __entry->dev) -+); -+ -+TRACE_EVENT(v3d_cache_clean_end, -+ TP_PROTO(struct drm_device *dev), -+ TP_ARGS(dev), -+ -+ TP_STRUCT__entry( -+ __field(u32, dev) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = dev->primary->index; -+ ), -+ -+ TP_printk("dev=%u", -+ __entry->dev) -+); -+ - TRACE_EVENT(v3d_reset_begin, - TP_PROTO(struct drm_device *dev), - TP_ARGS(dev), ---- a/include/uapi/drm/v3d_drm.h -+++ b/include/uapi/drm/v3d_drm.h -@@ -37,6 +37,7 @@ extern "C" { - #define DRM_V3D_GET_PARAM 0x04 - #define DRM_V3D_GET_BO_OFFSET 0x05 - #define DRM_V3D_SUBMIT_TFU 0x06 -+#define DRM_V3D_SUBMIT_CSD 0x07 - - #define DRM_IOCTL_V3D_SUBMIT_CL DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl) - #define DRM_IOCTL_V3D_WAIT_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo) -@@ -45,6 +46,7 @@ extern "C" { - #define DRM_IOCTL_V3D_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param) - #define DRM_IOCTL_V3D_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset) - #define DRM_IOCTL_V3D_SUBMIT_TFU DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu) -+#define DRM_IOCTL_V3D_SUBMIT_CSD DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd) - - /** - * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D -@@ -172,6 +174,7 @@ enum drm_v3d_param { - DRM_V3D_PARAM_V3D_CORE0_IDENT1, - DRM_V3D_PARAM_V3D_CORE0_IDENT2, - DRM_V3D_PARAM_SUPPORTS_TFU, -+ DRM_V3D_PARAM_SUPPORTS_CSD, - }; - - struct drm_v3d_get_param { -@@ -212,6 +215,31 @@ struct drm_v3d_submit_tfu { - __u32 out_sync; - }; - -+/* Submits a compute 
shader for dispatch. This job will block on any -+ * previous compute shaders submitted on this fd, and any other -+ * synchronization must be performed with in_sync/out_sync. -+ */ -+struct drm_v3d_submit_csd { -+ __u32 cfg[7]; -+ __u32 coef[4]; -+ -+ /* Pointer to a u32 array of the BOs that are referenced by the job. -+ */ -+ __u64 bo_handles; -+ -+ /* Number of BO handles passed in (size is that times 4). */ -+ __u32 bo_handle_count; -+ -+ /* sync object to block on before running the CSD job. Each -+ * CSD job will execute in the order submitted to its FD. -+ * Synchronization against rendering/TFU jobs or CSD from -+ * other fds requires using sync objects. -+ */ -+ __u32 in_sync; -+ /* Sync object to signal when the CSD job is done. */ -+ __u32 out_sync; -+}; -+ - #if defined(__cplusplus) - } - #endif |