Diffstat (limited to 'target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch')
-rw-r--r-- | target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch | 409 |
1 file changed, 409 insertions, 0 deletions
diff --git a/target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch b/target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch
new file mode 100644
index 0000000000..cec78b26e2
--- /dev/null
+++ b/target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch
@@ -0,0 +1,409 @@
+From a09f1129301f85dfb2f800c6dab2a5f8acac8f27 Mon Sep 17 00:00:00 2001
+From: Maxime Ripard <maxime@cerno.tech>
+Date: Thu, 16 Dec 2021 14:54:54 +0100
+Subject: [PATCH] drm/vc4: hvs: Defer dlist slots deallocation
+
+During normal operations, the cursor position update is done through an
+asynchronous plane update, which on the vc4 driver basically just
+modifies the right dlist word to move the plane to the new coordinates.
+
+However, when we have the overscan margins set up, we fall back to a
+regular commit when we are next to the edges. And since that commit
+happens to be on a cursor plane, it's considered a legacy cursor update
+by KMS.
+
+The main difference it makes is that it won't wait for its completion
+(ie, the next vblank) before returning. This means that if we have
+multiple commits happening in rapid succession, several of them can
+happen before the next vblank.
+
+In parallel, our dlist allocation is tied to a CRTC state, and each time
+we do a commit we end up with a new CRTC state, with the previous one
+being freed. This means that we free our previous dlist entry (but don't
+clear it) every time a new one is being committed.
+
+Now, if we were to have two commits happening before the next vblank, we
+could end up reusing the same dlist entries before the next vblank.
+
+Indeed, we would start from an initial state taking, for example, the
+dlist entries 10 to 20, then start a commit taking the entries 20 to 30
+and setting the dlist pointer to 20, and freeing the dlist entries 10 to
+20. However, since we haven't reached vblank yet, the HVS is still using
+the entries 10 to 20.
+
+If we were to make a new commit now, chances are the allocator is going
+to give the 10 to 20 entries back, and we would change their content to
+match the new state. If vblank hasn't happened yet, we just corrupted
+the active dlist entries.
+
+A first attempt to solve this was made by creating an intermediate dlist
+buffer to store the current (ie, as of the last commit) dlist content,
+that we would update each time the HVS is done with a frame. However, if
+the interrupt handler missed the vblank window, we would end up copying
+our intermediate dlist to the hardware one during the composition,
+essentially creating the same issue.
+
+Since making sure that our interrupt handler runs within a fixed,
+constrained, time window would require making Linux a real-time kernel,
+this seems a bit out of scope.
+
+Instead, we can work around our original issue by keeping the dlist
+slot allocations around for longer. That way, we won't reuse a dlist
+slot while it's still in flight. In order to achieve this, instead of
+freeing the dlist slot when its associated CRTC state is destroyed,
+we'll queue it in a list.
+
+A naive implementation would free the buffers in that queue when we get
+our end-of-frame interrupt. However, there's still a race since, just
+like in the shadow dlist case, we don't control when the handler for
+that interrupt is going to run. Thus, we can end up with a commit adding
+an old dlist allocation to our queue during the window between the
+actual interrupt and when our handler runs.
+And since that buffer is still being used for the composition of the
+current frame, we can't free it right away, exposing us to the original
+bug.
+
+Fortunately for us, the hardware provides a frame counter that is
+incremented each time the first line of a frame is being generated.
+Associating the frame count at which the image is supposed to go away
+with the allocation, and then only deallocating buffers whose counter
+is below or equal to the one we see when the deallocation code runs,
+should prevent the above race from occurring.
+
+Signed-off-by: Maxime Ripard <maxime@cerno.tech>
+---
+ drivers/gpu/drm/vc4/vc4_crtc.c |  10 +-
+ drivers/gpu/drm/vc4/vc4_drv.h  |  15 ++-
+ drivers/gpu/drm/vc4/vc4_hvs.c  | 181 ++++++++++++++++++++++++++++++---
+ drivers/gpu/drm/vc4/vc4_regs.h |   1 +
+ 4 files changed, 184 insertions(+), 23 deletions(-)
+
+--- a/drivers/gpu/drm/vc4/vc4_crtc.c
++++ b/drivers/gpu/drm/vc4/vc4_crtc.c
+@@ -959,14 +959,8 @@ void vc4_crtc_destroy_state(struct drm_c
+ 	struct vc4_dev *vc4 = to_vc4_dev(crtc->dev);
+ 	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(state);
+ 
+-	if (drm_mm_node_allocated(&vc4_state->mm)) {
+-		unsigned long flags;
+-
+-		spin_lock_irqsave(&vc4->hvs->mm_lock, flags);
+-		drm_mm_remove_node(&vc4_state->mm);
+-		spin_unlock_irqrestore(&vc4->hvs->mm_lock, flags);
+-
+-	}
++	vc4_hvs_mark_dlist_entry_stale(vc4->hvs, vc4_state->mm);
++	vc4_state->mm = NULL;
+ 
+ 	drm_atomic_helper_crtc_destroy_state(crtc, state);
+ }
+--- a/drivers/gpu/drm/vc4/vc4_drv.h
++++ b/drivers/gpu/drm/vc4/vc4_drv.h
+@@ -335,6 +335,9 @@ struct vc4_hvs {
+ 	struct drm_mm lbm_mm;
+ 	spinlock_t mm_lock;
+ 
++	struct list_head stale_dlist_entries;
++	struct work_struct free_dlist_work;
++
+ 	struct drm_mm_node mitchell_netravali_filter;
+ 
+ 	struct debugfs_regset32 regset;
+@@ -573,10 +576,16 @@ struct drm_connector *vc4_get_crtc_conne
+ struct drm_encoder *vc4_get_crtc_encoder(struct drm_crtc *crtc,
+ 					 struct drm_crtc_state *state);
+ 
++struct vc4_hvs_dlist_allocation {
++	struct list_head node;
++	struct drm_mm_node mm_node;
++	unsigned int channel;
++	u8 target_frame_count;
++};
++
+ struct vc4_crtc_state {
+ 	struct drm_crtc_state base;
+-	/* Dlist area for this CRTC configuration. */
+-	struct drm_mm_node mm;
++	struct vc4_hvs_dlist_allocation *mm;
+ 	bool txp_armed;
+ 	unsigned int assigned_channel;
+ 
+@@ -968,6 +977,8 @@ extern struct platform_driver vc4_hvs_dr
+ void vc4_hvs_stop_channel(struct vc4_hvs *hvs, unsigned int output);
+ int vc4_hvs_get_fifo_from_output(struct vc4_hvs *hvs, unsigned int output);
+ u8 vc4_hvs_get_fifo_frame_count(struct vc4_hvs *hvs, unsigned int fifo);
++void vc4_hvs_mark_dlist_entry_stale(struct vc4_hvs *hvs,
++				    struct vc4_hvs_dlist_allocation *alloc);
+ int vc4_hvs_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state);
+ void vc4_hvs_atomic_begin(struct drm_crtc *crtc, struct drm_atomic_state *state);
+ void vc4_hvs_atomic_enable(struct drm_crtc *crtc, struct drm_atomic_state *state);
+--- a/drivers/gpu/drm/vc4/vc4_hvs.c
++++ b/drivers/gpu/drm/vc4/vc4_hvs.c
+@@ -389,6 +389,150 @@ static void vc5_hvs_update_gamma_lut(str
+ 	vc5_hvs_lut_load(hvs, vc4_crtc);
+ }
+ 
++static void vc4_hvs_irq_enable_eof(const struct vc4_hvs *hvs,
++				   unsigned int channel)
++{
++	u32 irq_mask = hvs->hvs5 ?
++		SCALER5_DISPCTRL_DSPEIEOF(channel) :
++		SCALER_DISPCTRL_DSPEIEOF(channel);
++
++	HVS_WRITE(SCALER_DISPCTRL,
++		  HVS_READ(SCALER_DISPCTRL) | irq_mask);
++}
++
++static void vc4_hvs_irq_clear_eof(const struct vc4_hvs *hvs,
++				  unsigned int channel)
++{
++	u32 irq_mask = hvs->hvs5 ?
++		SCALER5_DISPCTRL_DSPEIEOF(channel) :
++		SCALER_DISPCTRL_DSPEIEOF(channel);
++
++	HVS_WRITE(SCALER_DISPCTRL,
++		  HVS_READ(SCALER_DISPCTRL) & ~irq_mask);
++}
++
++static struct vc4_hvs_dlist_allocation *
++vc4_hvs_alloc_dlist_entry(struct vc4_hvs *hvs,
++			  unsigned int channel,
++			  size_t dlist_count)
++{
++	struct vc4_hvs_dlist_allocation *alloc;
++	unsigned long flags;
++	int ret;
++
++	if (channel == VC4_HVS_CHANNEL_DISABLED)
++		return NULL;
++
++	alloc = kzalloc(sizeof(*alloc), GFP_KERNEL);
++	if (!alloc)
++		return ERR_PTR(-ENOMEM);
++
++	spin_lock_irqsave(&hvs->mm_lock, flags);
++	ret = drm_mm_insert_node(&hvs->dlist_mm, &alloc->mm_node,
++				 dlist_count);
++	spin_unlock_irqrestore(&hvs->mm_lock, flags);
++	if (ret)
++		return ERR_PTR(ret);
++
++	alloc->channel = channel;
++
++	return alloc;
++}
++
++void vc4_hvs_mark_dlist_entry_stale(struct vc4_hvs *hvs,
++				    struct vc4_hvs_dlist_allocation *alloc)
++{
++	unsigned long flags;
++	u8 frcnt;
++
++	if (!alloc)
++		return;
++
++	if (!drm_mm_node_allocated(&alloc->mm_node))
++		return;
++
++	frcnt = vc4_hvs_get_fifo_frame_count(hvs, alloc->channel);
++	alloc->target_frame_count = (frcnt + 1) & ((1 << 6) - 1);
++
++	spin_lock_irqsave(&hvs->mm_lock, flags);
++
++	list_add_tail(&alloc->node, &hvs->stale_dlist_entries);
++
++	HVS_WRITE(SCALER_DISPSTAT, SCALER_DISPSTAT_EOF(alloc->channel));
++	vc4_hvs_irq_enable_eof(hvs, alloc->channel);
++
++	spin_unlock_irqrestore(&hvs->mm_lock, flags);
++}
++
++static void vc4_hvs_schedule_dlist_sweep(struct vc4_hvs *hvs,
++					 unsigned int channel)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&hvs->mm_lock, flags);
++
++	if (!list_empty(&hvs->stale_dlist_entries))
++		queue_work(system_unbound_wq, &hvs->free_dlist_work);
++
++	vc4_hvs_irq_clear_eof(hvs, channel);
++
++	spin_unlock_irqrestore(&hvs->mm_lock, flags);
++}
++
++/*
++ * Frame counts are essentially sequence numbers over 6 bits, so we can
++ * use sequence number arithmetic, following RFC 1982, to implement a
++ * proper comparison between them.
++ */
++static bool vc4_hvs_frcnt_lte(u8 cnt1, u8 cnt2)
++{
++	return (s8)((cnt1 << 2) - (cnt2 << 2)) <= 0;
++}
++
++/*
++ * Some atomic commits (legacy cursor updates, mostly) will not wait for
++ * the next vblank and will just return once the commit has been pushed
++ * to the hardware.
++ *
++ * On the hardware side, our HVS stores the planes' parameters in its
++ * context RAM, and will use part of the RAM to store data during the
++ * frame rendering.
++ *
++ * This interacts badly if we get multiple commits before the next
++ * vblank since we could end up overwriting the DLIST entries used by
++ * previous commits if our dlist allocation reuses those entries. In
++ * such a case, we would overwrite the data currently being used by the
++ * hardware, resulting in a corrupted frame.
++ *
++ * In order to work around this, we'll queue the dlist entries in a list
++ * once the associated CRTC state is destroyed. The HVS only allows us
++ * to know which entry is currently active, but not which ones are no
++ * longer being used, so in order to avoid freeing entries that are
++ * still used by the hardware we add a guesstimate of the frame count
++ * at which our entry will no longer be used, and will only free those
++ * entries once we have reached that frame count.
++ */
++static void vc4_hvs_dlist_free_work(struct work_struct *work)
++{
++	struct vc4_hvs *hvs = container_of(work, struct vc4_hvs, free_dlist_work);
++	struct vc4_hvs_dlist_allocation *cur, *next;
++	unsigned long flags;
++
++	spin_lock_irqsave(&hvs->mm_lock, flags);
++	list_for_each_entry_safe(cur, next, &hvs->stale_dlist_entries, node) {
++		u8 frcnt;
++
++		frcnt = vc4_hvs_get_fifo_frame_count(hvs, cur->channel);
++		if (!vc4_hvs_frcnt_lte(cur->target_frame_count, frcnt))
++			continue;
++
++		list_del(&cur->node);
++		drm_mm_remove_node(&cur->mm_node);
++		kfree(cur);
++	}
++	spin_unlock_irqrestore(&hvs->mm_lock, flags);
++}
++
+ u8 vc4_hvs_get_fifo_frame_count(struct vc4_hvs *hvs, unsigned int fifo)
+ {
+ 	u8 field = 0;
+@@ -588,13 +732,12 @@ int vc4_hvs_atomic_check(struct drm_crtc
+ {
+ 	struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc);
+ 	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc_state);
++	struct vc4_hvs_dlist_allocation *alloc;
+ 	struct drm_device *dev = crtc->dev;
+ 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+ 	struct drm_plane *plane;
+-	unsigned long flags;
+ 	const struct drm_plane_state *plane_state;
+ 	u32 dlist_count = 0;
+-	int ret;
+ 
+ 	/* The pixelvalve can only feed one encoder (and encoders are
+ 	 * 1:1 with connectors.)
+@@ -607,12 +750,11 @@ int vc4_hvs_atomic_check(struct drm_crtc
+ 
+ 	dlist_count++; /* Account for SCALER_CTL0_END. */
+ 
+-	spin_lock_irqsave(&vc4->hvs->mm_lock, flags);
+-	ret = drm_mm_insert_node(&vc4->hvs->dlist_mm, &vc4_state->mm,
+-				 dlist_count);
+-	spin_unlock_irqrestore(&vc4->hvs->mm_lock, flags);
+-	if (ret)
+-		return ret;
++	alloc = vc4_hvs_alloc_dlist_entry(vc4->hvs, vc4_state->assigned_channel, dlist_count);
++	if (IS_ERR(alloc))
++		return PTR_ERR(alloc);
++
++	vc4_state->mm = alloc;
+ 
+ 	return vc4_hvs_gamma_check(crtc, state);
+ }
+@@ -624,8 +766,9 @@ static void vc4_hvs_install_dlist(struct
+ 	struct vc4_hvs *hvs = vc4->hvs;
+ 	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc->state);
+ 
++	WARN_ON(!vc4_state->mm);
+ 	HVS_WRITE(SCALER_DISPLISTX(vc4_state->assigned_channel),
+-		  vc4_state->mm.start);
++		  vc4_state->mm->mm_node.start);
+ }
+ 
+ static void vc4_hvs_update_dlist(struct drm_crtc *crtc)
+@@ -650,8 +793,10 @@ static void vc4_hvs_update_dlist(struct
+ 		spin_unlock_irqrestore(&dev->event_lock, flags);
+ 	}
+ 
++	WARN_ON(!vc4_state->mm);
++
+ 	spin_lock_irqsave(&vc4_crtc->irq_lock, flags);
+-	vc4_crtc->current_dlist = vc4_state->mm.start;
++	vc4_crtc->current_dlist = vc4_state->mm->mm_node.start;
+ 	spin_unlock_irqrestore(&vc4_crtc->irq_lock, flags);
+ }
+ 
+@@ -708,8 +853,7 @@ void vc4_hvs_atomic_flush(struct drm_crt
+ 	struct vc4_plane_state *vc4_plane_state;
+ 	bool debug_dump_regs = false;
+ 	bool enable_bg_fill = false;
+-	u32 __iomem *dlist_start = vc4->hvs->dlist + vc4_state->mm.start;
+-	u32 __iomem *dlist_next = dlist_start;
++	u32 __iomem *dlist_start, *dlist_next;
+ 
+ 	if (vc4_state->assigned_channel == VC4_HVS_CHANNEL_DISABLED)
+ 		return;
+@@ -719,6 +863,9 @@ void vc4_hvs_atomic_flush(struct drm_crt
+ 		vc4_hvs_dump_state(hvs);
+ 	}
+ 
++	dlist_start = vc4->hvs->dlist + vc4_state->mm->mm_node.start;
++	dlist_next = dlist_start;
++
+ 	/* Copy all the active planes' dlist contents to the hardware dlist. */
+ 	drm_atomic_crtc_for_each_plane(plane, crtc) {
+ 		/* Is this the first active plane? */
+@@ -741,7 +888,8 @@ void vc4_hvs_atomic_flush(struct drm_crt
+ 	writel(SCALER_CTL0_END, dlist_next);
+ 	dlist_next++;
+ 
+-	WARN_ON_ONCE(dlist_next - dlist_start != vc4_state->mm.size);
++	WARN_ON(!vc4_state->mm);
++	WARN_ON_ONCE(dlist_next - dlist_start != vc4_state->mm->mm_node.size);
+ 
+ 	if (enable_bg_fill)
+ 		/* This sets a black background color fill, as is the case
+@@ -846,6 +994,11 @@ static irqreturn_t vc4_hvs_irq_handler(i
+ 
+ 			irqret = IRQ_HANDLED;
+ 		}
++
++		if (status & SCALER_DISPSTAT_EOF(channel)) {
++			vc4_hvs_schedule_dlist_sweep(hvs, channel);
++			irqret = IRQ_HANDLED;
++		}
+ 	}
+ 
+ 	/* Clear every per-channel interrupt flag. */
+@@ -902,6 +1055,8 @@ static int vc4_hvs_bind(struct device *d
+ 		hvs->dlist = hvs->regs + SCALER5_DLIST_START;
+ 
+ 	spin_lock_init(&hvs->mm_lock);
++	INIT_LIST_HEAD(&hvs->stale_dlist_entries);
++	INIT_WORK(&hvs->free_dlist_work, vc4_hvs_dlist_free_work);
+ 
+ 	/* Set up the HVS display list memory manager. We never
+ 	 * overwrite the setup from the bootloader (just 128b out of
+--- a/drivers/gpu/drm/vc4/vc4_regs.h
++++ b/drivers/gpu/drm/vc4/vc4_regs.h
+@@ -234,6 +234,7 @@
+ # define SCALER_DISPCTRL_DSPEIEOLN(x)		BIT(8 + ((x) * 2))
+ /* Enables Display 0 EOF contribution to SCALER_DISPSTAT_IRQDISP0 */
+ # define SCALER_DISPCTRL_DSPEIEOF(x)		BIT(7 + ((x) * 2))
++# define SCALER5_DISPCTRL_DSPEIEOF(x)		BIT(7 + ((x) * 4))
+ 
+ # define SCALER_DISPCTRL_SLVRDEIRQ		BIT(6)
+ # define SCALER_DISPCTRL_SLVWREIRQ		BIT(5)
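
Editor's note: the 6-bit wraparound comparison implemented by vc4_hvs_frcnt_lte() in the patch above is easy to misread. The following stand-alone C sketch (a hypothetical user-space test, not part of the patch; the names frcnt_lte and main are illustration only) reproduces the same arithmetic and shows how counters that have wrapped past 63 still compare as "older or equal", in the spirit of RFC 1982 serial number arithmetic:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the patch's vc4_hvs_frcnt_lte(): shift the 6-bit
 * counters into the top bits of an 8-bit value, so that the signed
 * difference orders them modulo 64, treating gaps of up to half the
 * counter space (32) as "ahead". */
static int frcnt_lte(uint8_t cnt1, uint8_t cnt2)
{
	return (int8_t)((cnt1 << 2) - (cnt2 << 2)) <= 0;
}

int main(void)
{
	assert(frcnt_lte(5, 5));   /* equal counters compare as <= */
	assert(frcnt_lte(5, 6));   /* plain ordering, no wraparound */
	assert(!frcnt_lte(6, 5));
	assert(frcnt_lte(63, 1));  /* 63 wrapped around to 1: 63 is "older" */
	assert(!frcnt_lte(1, 63));
	printf("6-bit frame counter comparison behaves as expected\n");
	return 0;
}

This is why vc4_hvs_dlist_free_work() can simply skip entries whose target_frame_count has not yet been reached: even across a counter wrap the comparison remains correct, as long as the sweep work runs within half the counter range (32 frames) of the entry's target frame count.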