aboutsummaryrefslogtreecommitdiffstats
path: root/target/linux/brcm2708/patches-3.10/0070-bcm2708_fb-DMA-acceleration-for-fb_copyarea.patch
blob: f7a3051b2c621e38062ce1373bd432fa3b45418a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
From ba65074e39e6aee492bd3c077f640b29a0a89c05 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Mon, 17 Jun 2013 16:00:25 +0300
Subject: [PATCH 070/174] bcm2708_fb: DMA acceleration for fb_copyarea

Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425
Also used Simon's dmaer_master module as a reference for tweaking DMA
settings for better performance.

For now busylooping only. IRQ support might be added later.
With non-overclocked Raspberry Pi, the performance is ~360 MB/s
for simple copy or ~260 MB/s for two-pass copy (used when dragging
windows to the right).

In the case of using DMA channel 0, the performance improves
to ~440 MB/s.

For comparison, VFP optimized CPU copy can only do ~114 MB/s in
the same conditions (hindered by reading uncached source buffer).

Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
---
 drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 3 deletions(-)

--- a/drivers/video/bcm2708_fb.c
+++ b/drivers/video/bcm2708_fb.c
@@ -28,6 +28,7 @@
 #include <linux/printk.h>
 #include <linux/console.h>
 
+#include <mach/dma.h>
 #include <mach/platform.h>
 #include <mach/vcio.h>
 
@@ -63,6 +64,11 @@ struct bcm2708_fb {
 	struct fbinfo_s *info;
 	dma_addr_t dma;
 	u32 cmap[16];
+	int dma_chan;
+	int dma_irq;
+	void __iomem *dma_chan_base;
+	void *cb_base;		/* DMA control blocks */
+	dma_addr_t cb_handle;
 };
 
 #define to_bcm2708(info)	container_of(info, struct bcm2708_fb, fb)
@@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct f
 	cfb_fillrect(info, rect);
 }
 
+/* A helper function for configuring dma control block */
+static void set_dma_cb(struct bcm2708_dma_cb *cb,
+		       int        burst_size,
+		       dma_addr_t dst,
+		       int        dst_stride,
+		       dma_addr_t src,
+		       int        src_stride,
+		       int        w,
+		       int        h)
+{
+	cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH |
+		   BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH |
+		   BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE;
+	cb->dst = dst;
+	cb->src = src;
+	/*
+	 * This is not really obvious from the DMA documentation,
+	 * but the top 16 bits must be programmmed to "height -1"
+	 * and not "height" in 2D mode.
+	 */
+	cb->length = ((h - 1) << 16) | w;
+	cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w);
+	cb->pad[0] = 0;
+	cb->pad[1] = 0;
+}
+
 static void bcm2708_fb_copyarea(struct fb_info *info,
 				const struct fb_copyarea *region)
 {
-	/*print_debug("bcm2708_fb_copyarea\n"); */
-	cfb_copyarea(info, region);
+	struct bcm2708_fb *fb = to_bcm2708(info);
+	struct bcm2708_dma_cb *cb = fb->cb_base;
+	int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3;
+	/* Channel 0 supports larger bursts and is a bit faster */
+	int burst_size = (fb->dma_chan == 0) ? 8 : 2;
+
+	/* Fallback to cfb_copyarea() if we don't like something */
+	if (bytes_per_pixel > 4 ||
+	    info->var.xres > 1920 || info->var.yres > 1200 ||
+	    region->width <= 0 || region->width > info->var.xres ||
+	    region->height <= 0 || region->height > info->var.yres ||
+	    region->sx < 0 || region->sx >= info->var.xres ||
+	    region->sy < 0 || region->sy >= info->var.yres ||
+	    region->dx < 0 || region->dx >= info->var.xres ||
+	    region->dy < 0 || region->dy >= info->var.yres ||
+	    region->sx + region->width > info->var.xres ||
+	    region->dx + region->width > info->var.xres ||
+	    region->sy + region->height > info->var.yres ||
+	    region->dy + region->height > info->var.yres) {
+		cfb_copyarea(info, region);
+		return;
+	}
+
+	if (region->dy == region->sy && region->dx > region->sx) {
+		/*
+		 * A difficult case of overlapped copy. Because DMA can't
+		 * copy individual scanlines in backwards direction, we need
+		 * two-pass processing. We do it by programming a chain of dma
+		 * control blocks in the first 16K part of the buffer and use
+		 * the remaining 48K as the intermediate temporary scratch
+		 * buffer. The buffer size is sufficient to handle up to
+		 * 1920x1200 resolution at 32bpp pixel depth.
+		 */
+		int y;
+		dma_addr_t control_block_pa = fb->cb_handle;
+		dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024;
+		int scanline_size = bytes_per_pixel * region->width;
+		int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size;
+
+		for (y = 0; y < region->height; y += scanlines_per_cb) {
+			dma_addr_t src =
+				fb->fb.fix.smem_start +
+				bytes_per_pixel * region->sx +
+				(region->sy + y) * fb->fb.fix.line_length;
+			dma_addr_t dst =
+				fb->fb.fix.smem_start +
+				bytes_per_pixel * region->dx +
+				(region->dy + y) * fb->fb.fix.line_length;
+
+			if (region->height - y < scanlines_per_cb)
+				scanlines_per_cb = region->height - y;
+
+			set_dma_cb(cb, burst_size, scratchbuf, scanline_size,
+				   src, fb->fb.fix.line_length,
+				   scanline_size, scanlines_per_cb);
+			control_block_pa += sizeof(struct bcm2708_dma_cb);
+			cb->next = control_block_pa;
+			cb++;
+
+			set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length,
+				   scratchbuf, scanline_size,
+				   scanline_size, scanlines_per_cb);
+			control_block_pa += sizeof(struct bcm2708_dma_cb);
+			cb->next = control_block_pa;
+			cb++;
+		}
+		/* move the pointer back to the last dma control block */
+		cb--;
+	} else {
+		/* A single dma control block is enough. */
+		int sy, dy, stride;
+		if (region->dy <= region->sy) {
+			/* processing from top to bottom */
+			dy = region->dy;
+			sy = region->sy;
+			stride = fb->fb.fix.line_length;
+		} else {
+			/* processing from bottom to top */
+			dy = region->dy + region->height - 1;
+			sy = region->sy + region->height - 1;
+			stride = -fb->fb.fix.line_length;
+		}
+		set_dma_cb(cb, burst_size,
+			   fb->fb.fix.smem_start + dy * fb->fb.fix.line_length +
+						   bytes_per_pixel * region->dx,
+			   stride,
+			   fb->fb.fix.smem_start + sy * fb->fb.fix.line_length +
+						   bytes_per_pixel * region->sx,
+			   stride,
+			   region->width * bytes_per_pixel,
+			   region->height);
+	}
+
+	/* end of dma control blocks chain */
+	cb->next = 0;
+
+	bcm_dma_start(fb->dma_chan_base, fb->cb_handle);
+	bcm_dma_wait_idle(fb->dma_chan_base);
 }
 
 static void bcm2708_fb_imageblit(struct fb_info *info,
@@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bc
 		fb->dma = dma;
 	}
 	fb->fb.fbops = &bcm2708_fb_ops;
-	fb->fb.flags = FBINFO_FLAG_DEFAULT;
+	fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA;
 	fb->fb.pseudo_palette = fb->cmap;
 
 	strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id));
@@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platf
 	}
 	memset(fb, 0, sizeof(struct bcm2708_fb));
 
+	fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K,
+					     &fb->cb_handle, GFP_KERNEL);
+	if (!fb->cb_base) {
+		dev_err(&dev->dev, "cannot allocate DMA CBs\n");
+		ret = -ENOMEM;
+		goto free_fb;
+	}
+
+	pr_info("BCM2708FB: allocated DMA memory %08x\n",
+	       fb->cb_handle);
+
+	ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK,
+				 &fb->dma_chan_base, &fb->dma_irq);
+	if (ret < 0) {
+		dev_err(&dev->dev, "couldn't allocate a DMA channel\n");
+		goto free_cb;
+	}
+	fb->dma_chan = ret;
+
+	pr_info("BCM2708FB: allocated DMA channel %d @ %p\n",
+	       fb->dma_chan, fb->dma_chan_base);
+
 	fb->dev = dev;
 
 	ret = bcm2708_fb_register(fb);
@@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platf
 		goto out;
 	}
 
+free_cb:
+	dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
+free_fb:
 	kfree(fb);
 free_region:
 	dev_err(&dev->dev, "probe failed, err %d\n", ret);
@@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct plat
 		iounmap(fb->fb.screen_base);
 	unregister_framebuffer(&fb->fb);
 
+	dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle);
+	bcm_dma_chan_free(fb->dma_chan);
+
 	dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info,
 			  fb->dma);
 	kfree(fb);