Diffstat (limited to 'target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch')
-rw-r--r-- | target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch | 409 |
1 file changed, 409 insertions, 0 deletions
diff --git a/target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch b/target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch
new file mode 100644
index 0000000000..cec78b26e2
--- /dev/null
+++ b/target/linux/bcm27xx/patches-5.15/950-0742-drm-vc4-hvs-Defer-dlist-slots-deallocation.patch
@@ -0,0 +1,409 @@
+From a09f1129301f85dfb2f800c6dab2a5f8acac8f27 Mon Sep 17 00:00:00 2001
+From: Maxime Ripard <maxime@cerno.tech>
+Date: Thu, 16 Dec 2021 14:54:54 +0100
+Subject: [PATCH] drm/vc4: hvs: Defer dlist slots deallocation
+
+During normal operations, the cursor position update is done through an
+asynchronous plane update, which on the vc4 driver basically just
+modifies the right dlist word to move the plane to the new coordinates.
+
+However, when we have the overscan margins set up, we fall back to a
+regular commit when we are next to the edges. And since that commit
+happens to be on a cursor plane, it's considered a legacy cursor update
+by KMS.
+
+The main difference it makes is that it won't wait for its completion
+(ie, the next vblank) before returning. This means that if we have
+multiple commits happening in rapid succession, several of them can
+happen before the next vblank.
+
+In parallel, our dlist allocation is tied to a CRTC state, and each time
+we do a commit we end up with a new CRTC state, with the previous one
+being freed. This means that we free our previous dlist entry (but don't
+clear it) every time a new one is being committed.
+
+Now, if we were to have two commits happening before the next vblank, we
+could end up reusing the same dlist entries before the next vblank.
+
+Indeed, we would start from an initial state taking, for example, the
+dlist entries 10 to 20, then start a commit taking the entries 20 to 30
+and setting the dlist pointer to 20, and freeing the dlist entries 10 to
+20. However, since we haven't reached vblank yet, the HVS is still using
+the entries 10 to 20.
+
+If we were to make a new commit now, chances are the allocator is going
+to give the 10 to 20 entries back, and we would change their content to
+match the new state. If vblank hasn't happened yet, we just corrupted
+the active dlist entries.
+
+A first attempt to solve this was made by creating an intermediate dlist
+buffer to store the current (ie, as of the last commit) dlist content,
+that we would update each time the HVS is done with a frame. However, if
+the interrupt handler missed the vblank window, we would end up copying
+our intermediate dlist to the hardware one during the composition,
+essentially creating the same issue.
+
+Since making sure that our interrupt handler runs within a fixed,
+constrained, time window would require making Linux a real-time kernel,
+this seems a bit out of scope.
+
+Instead, we can work around our original issue by keeping the dlist
+slot allocations around for longer. That way, we won't reuse a dlist
+slot while it's still in flight. In order to achieve this, instead of
+freeing the dlist slot when its associated CRTC state is destroyed,
+we'll queue it in a list.
+
+A naive implementation would free the buffers in that queue when we get
+our end-of-frame interrupt. However, there's still a race since, just
+like in the shadow dlist case, we don't control when the handler for
+that interrupt is going to run. Thus, we can end up with a commit adding
+an old dlist allocation to our queue during the window between the
+actual interrupt and when our handler runs.
+And since that buffer is still being used for the composition of the
+current frame, we can't free it right away, exposing us to the original
+bug.
+
+Fortunately for us, the hardware provides a frame counter that is
+incremented each time the first line of a frame is being generated.
+Associating the frame count at which the image is supposed to go away
+with the allocation, and then only deallocating buffers whose counter
+is below or equal to the one we see when the deallocation code runs,
+should prevent the above race from occurring.
+
+Signed-off-by: Maxime Ripard <maxime@cerno.tech>
+---
+ drivers/gpu/drm/vc4/vc4_crtc.c |  10 +-
+ drivers/gpu/drm/vc4/vc4_drv.h  |  15 ++-
+ drivers/gpu/drm/vc4/vc4_hvs.c  | 181 ++++++++++++++++++++++++++++++---
+ drivers/gpu/drm/vc4/vc4_regs.h |   1 +
+ 4 files changed, 184 insertions(+), 23 deletions(-)
+
+--- a/drivers/gpu/drm/vc4/vc4_crtc.c
++++ b/drivers/gpu/drm/vc4/vc4_crtc.c
+@@ -959,14 +959,8 @@ void vc4_crtc_destroy_state(struct drm_c
+ 	struct vc4_dev *vc4 = to_vc4_dev(crtc->dev);
+ 	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(state);
+ 
+-	if (drm_mm_node_allocated(&vc4_state->mm)) {
+-		unsigned long flags;
+-
+-		spin_lock_irqsave(&vc4->hvs->mm_lock, flags);
+-		drm_mm_remove_node(&vc4_state->mm);
+-		spin_unlock_irqrestore(&vc4->hvs->mm_lock, flags);
+-
+-	}
++	vc4_hvs_mark_dlist_entry_stale(vc4->hvs, vc4_state->mm);
++	vc4_state->mm = NULL;
+ 
+ 	drm_atomic_helper_crtc_destroy_state(crtc, state);
+ }
+--- a/drivers/gpu/drm/vc4/vc4_drv.h
++++ b/drivers/gpu/drm/vc4/vc4_drv.h
+@@ -335,6 +335,9 @@ struct vc4_hvs {
+ 	struct drm_mm lbm_mm;
+ 	spinlock_t mm_lock;
+ 
++	struct list_head stale_dlist_entries;
++	struct work_struct free_dlist_work;
++
+ 	struct drm_mm_node mitchell_netravali_filter;
+ 
+ 	struct debugfs_regset32 regset;
+@@ -573,10 +576,16 @@ struct drm_connector *vc4_get_crtc_conne
+ struct drm_encoder *vc4_get_crtc_encoder(struct drm_crtc *crtc,
+ 					 struct drm_crtc_state *state);
+ 
++struct vc4_hvs_dlist_allocation {
++	struct list_head node;
++	struct drm_mm_node mm_node;
++	unsigned int channel;
++	u8 target_frame_count;
++};
++
+ struct vc4_crtc_state {
+ 	struct drm_crtc_state base;
+-	/* Dlist area for this CRTC configuration. */
+-	struct drm_mm_node mm;
++	struct vc4_hvs_dlist_allocation *mm;
+ 	bool txp_armed;
+ 	unsigned int assigned_channel;
+ 
+@@ -968,6 +977,8 @@ extern struct platform_driver vc4_hvs_dr
+ void vc4_hvs_stop_channel(struct vc4_hvs *hvs, unsigned int output);
+ int vc4_hvs_get_fifo_from_output(struct vc4_hvs *hvs, unsigned int output);
+ u8 vc4_hvs_get_fifo_frame_count(struct vc4_hvs *hvs, unsigned int fifo);
++void vc4_hvs_mark_dlist_entry_stale(struct vc4_hvs *hvs,
++				    struct vc4_hvs_dlist_allocation *alloc);
+ int vc4_hvs_atomic_check(struct drm_crtc *crtc, struct drm_atomic_state *state);
+ void vc4_hvs_atomic_begin(struct drm_crtc *crtc, struct drm_atomic_state *state);
+ void vc4_hvs_atomic_enable(struct drm_crtc *crtc, struct drm_atomic_state *state);
+--- a/drivers/gpu/drm/vc4/vc4_hvs.c
++++ b/drivers/gpu/drm/vc4/vc4_hvs.c
+@@ -389,6 +389,150 @@ static void vc5_hvs_update_gamma_lut(str
+ 	vc5_hvs_lut_load(hvs, vc4_crtc);
+ }
+ 
++static void vc4_hvs_irq_enable_eof(const struct vc4_hvs *hvs,
++				   unsigned int channel)
++{
++	u32 irq_mask = hvs->hvs5 ?
++		SCALER5_DISPCTRL_DSPEIEOF(channel) :
++		SCALER_DISPCTRL_DSPEIEOF(channel);
++
++	HVS_WRITE(SCALER_DISPCTRL,
++		  HVS_READ(SCALER_DISPCTRL) | irq_mask);
++}
++
++static void vc4_hvs_irq_clear_eof(const struct vc4_hvs *hvs,
++				  unsigned int channel)
++{
++	u32 irq_mask = hvs->hvs5 ?
++		SCALER5_DISPCTRL_DSPEIEOF(channel) :
++		SCALER_DISPCTRL_DSPEIEOF(channel);
++
++	HVS_WRITE(SCALER_DISPCTRL,
++		  HVS_READ(SCALER_DISPCTRL) & ~irq_mask);
++}
++
++static struct vc4_hvs_dlist_allocation *
++vc4_hvs_alloc_dlist_entry(struct vc4_hvs *hvs,
++			  unsigned int channel,
++			  size_t dlist_count)
++{
++	struct vc4_hvs_dlist_allocation *alloc;
++	unsigned long flags;
++	int ret;
++
++	if (channel == VC4_HVS_CHANNEL_DISABLED)
++		return NULL;
++
++	alloc = kzalloc(sizeof(*alloc), GFP_KERNEL);
++	if (!alloc)
++		return ERR_PTR(-ENOMEM);
++
++	spin_lock_irqsave(&hvs->mm_lock, flags);
++	ret = drm_mm_insert_node(&hvs->dlist_mm, &alloc->mm_node,
++				 dlist_count);
++	spin_unlock_irqrestore(&hvs->mm_lock, flags);
++	if (ret)
++		return ERR_PTR(ret);
++
++	alloc->channel = channel;
++
++	return alloc;
++}
++
++void vc4_hvs_mark_dlist_entry_stale(struct vc4_hvs *hvs,
++				    struct vc4_hvs_dlist_allocation *alloc)
++{
++	unsigned long flags;
++	u8 frcnt;
++
++	if (!alloc)
++		return;
++
++	if (!drm_mm_node_allocated(&alloc->mm_node))
++		return;
++
++	frcnt = vc4_hvs_get_fifo_frame_count(hvs, alloc->channel);
++	alloc->target_frame_count = (frcnt + 1) & ((1 << 6) - 1);
++
++	spin_lock_irqsave(&hvs->mm_lock, flags);
++
++	list_add_tail(&alloc->node, &hvs->stale_dlist_entries);
++
++	HVS_WRITE(SCALER_DISPSTAT, SCALER_DISPSTAT_EOF(alloc->channel));
++	vc4_hvs_irq_enable_eof(hvs, alloc->channel);
++
++	spin_unlock_irqrestore(&hvs->mm_lock, flags);
++}
++
++static void vc4_hvs_schedule_dlist_sweep(struct vc4_hvs *hvs,
++					 unsigned int channel)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&hvs->mm_lock, flags);
++
++	if (!list_empty(&hvs->stale_dlist_entries))
++		queue_work(system_unbound_wq, &hvs->free_dlist_work);
++
++	vc4_hvs_irq_clear_eof(hvs, channel);
++
++	spin_unlock_irqrestore(&hvs->mm_lock, flags);
++}
++
++/*
++ * Frame counts are essentially sequence numbers over 6 bits, so we can
++ * use sequence number arithmetic, following RFC 1982, to implement a
++ * proper comparison between them.
++ */
++static bool vc4_hvs_frcnt_lte(u8 cnt1, u8 cnt2)
++{
++	return (s8)((cnt1 << 2) - (cnt2 << 2)) <= 0;
++}
++
++/*
++ * Some atomic commits (legacy cursor updates, mostly) will not wait for
++ * the next vblank and will just return once the commit has been pushed
++ * to the hardware.
++ *
++ * On the hardware side, our HVS stores the planes' parameters in its
++ * context RAM, and will use part of the RAM to store data during the
++ * frame rendering.
++ *
++ * This interacts badly if we get multiple commits before the next
++ * vblank since we could end up overwriting the DLIST entries used by
++ * previous commits if our dlist allocation reuses those entries. In
++ * such a case, we would overwrite the data currently being used by the
++ * hardware, resulting in a corrupted frame.
++ *
++ * In order to work around this, we'll queue the dlist entries in a list
++ * once the associated CRTC state is destroyed. The HVS only allows us
++ * to know which entry is currently active, but not which ones are no
++ * longer being used, so in order to avoid freeing entries that are
++ * still used by the hardware we add a guesstimate of the frame count
++ * at which our entry will no longer be used, and will only free those
++ * entries once we have reached that frame count.
++ */
++static void vc4_hvs_dlist_free_work(struct work_struct *work)
++{
++	struct vc4_hvs *hvs = container_of(work, struct vc4_hvs, free_dlist_work);
++	struct vc4_hvs_dlist_allocation *cur, *next;
++	unsigned long flags;
++
++	spin_lock_irqsave(&hvs->mm_lock, flags);
++	list_for_each_entry_safe(cur, next, &hvs->stale_dlist_entries, node) {
++		u8 frcnt;
++
++		frcnt = vc4_hvs_get_fifo_frame_count(hvs, cur->channel);
++		if (!vc4_hvs_frcnt_lte(cur->target_frame_count, frcnt))
++			continue;
++
++		list_del(&cur->node);
++		drm_mm_remove_node(&cur->mm_node);
++		kfree(cur);
++	}
++	spin_unlock_irqrestore(&hvs->mm_lock, flags);
++}
++
+ u8 vc4_hvs_get_fifo_frame_count(struct vc4_hvs *hvs, unsigned int fifo)
+ {
+ 	u8 field = 0;
+@@ -588,13 +732,12 @@ int vc4_hvs_atomic_check(struct drm_crtc
+ {
+ 	struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc);
+ 	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc_state);
++	struct vc4_hvs_dlist_allocation *alloc;
+ 	struct drm_device *dev = crtc->dev;
+ 	struct vc4_dev *vc4 = to_vc4_dev(dev);
+ 	struct drm_plane *plane;
+-	unsigned long flags;
+ 	const struct drm_plane_state *plane_state;
+ 	u32 dlist_count = 0;
+-	int ret;
+ 
+ 	/* The pixelvalve can only feed one encoder (and encoders are
+ 	 * 1:1 with connectors.)
+@@ -607,12 +750,11 @@ int vc4_hvs_atomic_check(struct drm_crtc
+ 
+ 	dlist_count++; /* Account for SCALER_CTL0_END. */
+ 
+-	spin_lock_irqsave(&vc4->hvs->mm_lock, flags);
+-	ret = drm_mm_insert_node(&vc4->hvs->dlist_mm, &vc4_state->mm,
+-				 dlist_count);
+-	spin_unlock_irqrestore(&vc4->hvs->mm_lock, flags);
+-	if (ret)
+-		return ret;
++	alloc = vc4_hvs_alloc_dlist_entry(vc4->hvs, vc4_state->assigned_channel, dlist_count);
++	if (IS_ERR(alloc))
++		return PTR_ERR(alloc);
++
++	vc4_state->mm = alloc;
+ 
+ 	return vc4_hvs_gamma_check(crtc, state);
+ }
+@@ -624,8 +766,9 @@ static void vc4_hvs_install_dlist(struct
+ 	struct vc4_hvs *hvs = vc4->hvs;
+ 	struct vc4_crtc_state *vc4_state = to_vc4_crtc_state(crtc->state);
+ 
++	WARN_ON(!vc4_state->mm);
+ 	HVS_WRITE(SCALER_DISPLISTX(vc4_state->assigned_channel),
+-		  vc4_state->mm.start);
++		  vc4_state->mm->mm_node.start);
+ }
+ 
+ static void vc4_hvs_update_dlist(struct drm_crtc *crtc)
+@@ -650,8 +793,10 @@ static void vc4_hvs_update_dlist(struct
+ 		spin_unlock_irqrestore(&dev->event_lock, flags);
+ 	}
+ 
++	WARN_ON(!vc4_state->mm);
++
+ 	spin_lock_irqsave(&vc4_crtc->irq_lock, flags);
+-	vc4_crtc->current_dlist = vc4_state->mm.start;
++	vc4_crtc->current_dlist = vc4_state->mm->mm_node.start;
+ 	spin_unlock_irqrestore(&vc4_crtc->irq_lock, flags);
+ }
+ 
+@@ -708,8 +853,7 @@ void vc4_hvs_atomic_flush(struct drm_crt
+ 	struct vc4_plane_state *vc4_plane_state;
+ 	bool debug_dump_regs = false;
+ 	bool enable_bg_fill = false;
+-	u32 __iomem *dlist_start = vc4->hvs->dlist + vc4_state->mm.start;
+-	u32 __iomem *dlist_next = dlist_start;
++	u32 __iomem *dlist_start, *dlist_next;
+ 
+ 	if (vc4_state->assigned_channel == VC4_HVS_CHANNEL_DISABLED)
+ 		return;
+@@ -719,6 +863,9 @@ void vc4_hvs_atomic_flush(struct drm_crt
+ 		vc4_hvs_dump_state(hvs);
+ 	}
+ 
++	dlist_start = vc4->hvs->dlist + vc4_state->mm->mm_node.start;
++	dlist_next = dlist_start;
++
+ 	/* Copy all the active planes' dlist contents to the hardware dlist. */
+ 	drm_atomic_crtc_for_each_plane(plane, crtc) {
+ 		/* Is this the first active plane? */
+@@ -741,7 +888,8 @@ void vc4_hvs_atomic_flush(struct drm_crt
+ 	writel(SCALER_CTL0_END, dlist_next);
+ 	dlist_next++;
+ 
+-	WARN_ON_ONCE(dlist_next - dlist_start != vc4_state->mm.size);
++	WARN_ON(!vc4_state->mm);
++	WARN_ON_ONCE(dlist_next - dlist_start != vc4_state->mm->mm_node.size);
+ 
+ 	if (enable_bg_fill)
+ 		/* This sets a black background color fill, as is the case
+@@ -846,6 +994,11 @@ static irqreturn_t vc4_hvs_irq_handler(i
+ 
+ 			irqret = IRQ_HANDLED;
+ 		}
++
++		if (status & SCALER_DISPSTAT_EOF(channel)) {
++			vc4_hvs_schedule_dlist_sweep(hvs, channel);
++			irqret = IRQ_HANDLED;
++		}
+ 	}
+ 
+ 	/* Clear every per-channel interrupt flag. */
+@@ -902,6 +1055,8 @@ static int vc4_hvs_bind(struct device *d
+ 		hvs->dlist = hvs->regs + SCALER5_DLIST_START;
+ 
+ 	spin_lock_init(&hvs->mm_lock);
++	INIT_LIST_HEAD(&hvs->stale_dlist_entries);
++	INIT_WORK(&hvs->free_dlist_work, vc4_hvs_dlist_free_work);
+ 
+ 	/* Set up the HVS display list memory manager. We never
+ 	 * overwrite the setup from the bootloader (just 128b out of
+--- a/drivers/gpu/drm/vc4/vc4_regs.h
++++ b/drivers/gpu/drm/vc4/vc4_regs.h
+@@ -234,6 +234,7 @@
+ # define SCALER_DISPCTRL_DSPEIEOLN(x)		BIT(8 + ((x) * 2))
+ /* Enables Display 0 EOF contribution to SCALER_DISPSTAT_IRQDISP0 */
+ # define SCALER_DISPCTRL_DSPEIEOF(x)		BIT(7 + ((x) * 2))
++# define SCALER5_DISPCTRL_DSPEIEOF(x)		BIT(7 + ((x) * 4))
+ 
+ # define SCALER_DISPCTRL_SLVRDEIRQ		BIT(6)
+ # define SCALER_DISPCTRL_SLVWREIRQ		BIT(5)
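
Editor's note: the 6-bit wraparound comparison implemented by vc4_hvs_frcnt_lte() in the patch above is easy to misread. The following stand-alone C sketch (a hypothetical user-space test, not part of the patch; the names frcnt_lte and main are illustration only) reproduces the same arithmetic and shows how counters that have wrapped past 63 still compare as "older or equal", in the spirit of RFC 1982 serial number arithmetic:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as the patch's vc4_hvs_frcnt_lte(): shift the 6-bit
 * counters into the top bits of an 8-bit value, so that the signed
 * difference orders them modulo 64, treating gaps of up to half the
 * counter space (32) as "ahead". */
static int frcnt_lte(uint8_t cnt1, uint8_t cnt2)
{
	return (int8_t)((cnt1 << 2) - (cnt2 << 2)) <= 0;
}

int main(void)
{
	assert(frcnt_lte(5, 5));   /* equal counters compare as <= */
	assert(frcnt_lte(5, 6));   /* plain ordering, no wraparound */
	assert(!frcnt_lte(6, 5));
	assert(frcnt_lte(63, 1));  /* 63 wrapped around to 1: 63 is "older" */
	assert(!frcnt_lte(1, 63));
	printf("6-bit frame counter comparison behaves as expected\n");
	return 0;
}

This is why vc4_hvs_dlist_free_work() can simply skip entries whose target_frame_count has not yet been reached: even across a counter wrap the comparison remains correct, as long as the sweep work runs within half the counter range (32 frames) of the entry's target frame count.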