aboutsummaryrefslogtreecommitdiffstats
path: root/target/linux/bcm27xx/patches-5.4/950-0698-bcm2835-dma-Add-proper-40-bit-DMA-support.patch
blob: 3fb47589ee3c08b92617ee888d626495a9431a9d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
From a2f673d1aa39752608e5f4838ed9656b38cbc4b9 Mon Sep 17 00:00:00 2001
From: Phil Elwell <phil@raspberrypi.org>
Date: Thu, 4 Apr 2019 13:33:47 +0100
Subject: [PATCH] bcm2835-dma: Add proper 40-bit DMA support

BCM2711 has 4 DMA channels with a 40-bit address range, allowing them
to access the full 4GB of memory on a Pi 4.

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
---
 drivers/dma/bcm2835-dma.c | 485 ++++++++++++++++++++++++++++++++------
 1 file changed, 412 insertions(+), 73 deletions(-)

--- a/drivers/dma/bcm2835-dma.c
+++ b/drivers/dma/bcm2835-dma.c
@@ -38,6 +38,11 @@
 #define BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED 14
 #define BCM2835_DMA_CHAN_NAME_SIZE 8
 #define BCM2835_DMA_BULK_MASK  BIT(0)
+#define BCM2711_DMA_MEMCPY_CHAN 14
+
+struct bcm2835_dma_cfg_data {
+	u32	chan_40bit_mask;
+};
 
 /**
  * struct bcm2835_dmadev - BCM2835 DMA controller
@@ -52,6 +57,7 @@ struct bcm2835_dmadev {
 	void __iomem *base;
 	struct device_dma_parameters dma_parms;
 	dma_addr_t zero_page;
+	const struct bcm2835_dma_cfg_data *cfg_data;
 };
 
 struct bcm2835_dma_cb {
@@ -64,6 +70,17 @@ struct bcm2835_dma_cb {
 	uint32_t pad[2];
 };
 
+struct bcm2711_dma40_scb {
+	uint32_t ti;
+	uint32_t src;
+	uint32_t srci;
+	uint32_t dst;
+	uint32_t dsti;
+	uint32_t len;
+	uint32_t next_cb;
+	uint32_t rsvd;
+};
+
 struct bcm2835_cb_entry {
 	struct bcm2835_dma_cb *cb;
 	dma_addr_t paddr;
@@ -84,6 +101,7 @@ struct bcm2835_chan {
 	unsigned int irq_flags;
 
 	bool is_lite_channel;
+	bool is_40bit_channel;
 };
 
 struct bcm2835_desc {
@@ -173,13 +191,118 @@ struct bcm2835_desc {
 #define BCM2835_DMA_DATA_TYPE_S128	16
 
 /* Valid only for channels 0 - 14, 15 has its own base address */
-#define BCM2835_DMA_CHAN(n)	((n) << 8) /* Base address */
+#define BCM2835_DMA_CHAN_SIZE	0x100
+#define BCM2835_DMA_CHAN(n)	((n) * BCM2835_DMA_CHAN_SIZE) /* Base address */
 #define BCM2835_DMA_CHANIO(base, n) ((base) + BCM2835_DMA_CHAN(n))
 
 /* the max dma length for different channels */
 #define MAX_DMA_LEN SZ_1G
 #define MAX_LITE_DMA_LEN (SZ_64K - 4)
 
+/* 40-bit DMA support */
+#define BCM2711_DMA40_CS	0x00
+#define BCM2711_DMA40_CB	0x04
+#define BCM2711_DMA40_DEBUG	0x0c
+#define BCM2711_DMA40_TI	0x10
+#define BCM2711_DMA40_SRC	0x14
+#define BCM2711_DMA40_SRCI	0x18
+#define BCM2711_DMA40_DEST	0x1c
+#define BCM2711_DMA40_DESTI	0x20
+#define BCM2711_DMA40_LEN	0x24
+#define BCM2711_DMA40_NEXT_CB	0x28
+#define BCM2711_DMA40_DEBUG2	0x2c
+
+#define BCM2711_DMA40_ACTIVE		BIT(0)
+#define BCM2711_DMA40_END		BIT(1)
+#define BCM2711_DMA40_INT		BIT(2)
+#define BCM2711_DMA40_DREQ		BIT(3)  /* DREQ state */
+#define BCM2711_DMA40_RD_PAUSED		BIT(4)  /* Reading is paused */
+#define BCM2711_DMA40_WR_PAUSED		BIT(5)  /* Writing is paused */
+#define BCM2711_DMA40_DREQ_PAUSED	BIT(6)  /* Is paused by DREQ flow control */
+#define BCM2711_DMA40_WAITING_FOR_WRITES BIT(7)  /* Waiting for last write */
+#define BCM2711_DMA40_ERR		BIT(10)
+#define BCM2711_DMA40_QOS(x)		(((x) & 0x1f) << 16)
+#define BCM2711_DMA40_PANIC_QOS(x)	(((x) & 0x1f) << 20)
+#define BCM2711_DMA40_WAIT_FOR_WRITES	BIT(28)
+#define BCM2711_DMA40_DISDEBUG		BIT(29)
+#define BCM2711_DMA40_ABORT		BIT(30)
+#define BCM2711_DMA40_HALT		BIT(31)
+#define BCM2711_DMA40_CS_FLAGS(x) (x & (BCM2711_DMA40_QOS(15) | \
+					BCM2711_DMA40_PANIC_QOS(15) | \
+					BCM2711_DMA40_WAIT_FOR_WRITES |	\
+					BCM2711_DMA40_DISDEBUG))
+
+/* Transfer information bits */
+#define BCM2711_DMA40_INTEN		BIT(0)
+#define BCM2711_DMA40_TDMODE		BIT(1) /* 2D-Mode */
+#define BCM2711_DMA40_WAIT_RESP		BIT(2) /* wait for AXI write to be acked */
+#define BCM2711_DMA40_WAIT_RD_RESP	BIT(3) /* wait for AXI read to complete */
+#define BCM2711_DMA40_PER_MAP(x)	((x & 31) << 9) /* REQ source */
+#define BCM2711_DMA40_S_DREQ		BIT(14) /* enable SREQ for source */
+#define BCM2711_DMA40_D_DREQ		BIT(15) /* enable DREQ for destination */
+#define BCM2711_DMA40_S_WAIT(x)		((x & 0xff) << 16) /* add DMA read-wait cycles */
+#define BCM2711_DMA40_D_WAIT(x)		((x & 0xff) << 24) /* add DMA write-wait cycles */
+
+/* debug register bits */
+#define BCM2711_DMA40_DEBUG_WRITE_ERR		BIT(0)
+#define BCM2711_DMA40_DEBUG_FIFO_ERR		BIT(1)
+#define BCM2711_DMA40_DEBUG_READ_ERR		BIT(2)
+#define BCM2711_DMA40_DEBUG_READ_CB_ERR		BIT(3)
+#define BCM2711_DMA40_DEBUG_IN_ON_ERR		BIT(8)
+#define BCM2711_DMA40_DEBUG_ABORT_ON_ERR	BIT(9)
+#define BCM2711_DMA40_DEBUG_HALT_ON_ERR		BIT(10)
+#define BCM2711_DMA40_DEBUG_DISABLE_CLK_GATE	BIT(11)
+#define BCM2711_DMA40_DEBUG_RSTATE_SHIFT	14
+#define BCM2711_DMA40_DEBUG_RSTATE_BITS		4
+#define BCM2711_DMA40_DEBUG_WSTATE_SHIFT	18
+#define BCM2711_DMA40_DEBUG_WSTATE_BITS		4
+#define BCM2711_DMA40_DEBUG_RESET		BIT(23)
+#define BCM2711_DMA40_DEBUG_ID_SHIFT		24
+#define BCM2711_DMA40_DEBUG_ID_BITS		4
+#define BCM2711_DMA40_DEBUG_VERSION_SHIFT	28
+#define BCM2711_DMA40_DEBUG_VERSION_BITS	4
+
+/* Valid only for channels 0 - 3 (11 - 14) */
+#define BCM2711_DMA40_CHAN(n)	(((n) + 11) << 8) /* Base address */
+#define BCM2711_DMA40_CHANIO(base, n) ((base) + BCM2711_DMA_CHAN(n))
+
+/* the max dma length for different channels */
+#define MAX_DMA40_LEN SZ_1G
+
+#define BCM2711_DMA40_BURST_LEN(x)	((min(x,16) - 1) << 8)
+#define BCM2711_DMA40_INC		BIT(12)
+#define BCM2711_DMA40_SIZE_32		(0 << 13)
+#define BCM2711_DMA40_SIZE_64		(1 << 13)
+#define BCM2711_DMA40_SIZE_128		(2 << 13)
+#define BCM2711_DMA40_SIZE_256		(3 << 13)
+#define BCM2711_DMA40_IGNORE		BIT(15)
+#define BCM2711_DMA40_STRIDE(x)		((x) << 16) /* For 2D mode */
+
+#define BCM2711_DMA40_MEMCPY_FLAGS \
+	(BCM2711_DMA40_QOS(0) | \
+	 BCM2711_DMA40_PANIC_QOS(0) | \
+	 BCM2711_DMA40_WAIT_FOR_WRITES | \
+	 BCM2711_DMA40_DISDEBUG)
+
+#define BCM2711_DMA40_MEMCPY_XFER_INFO \
+	(BCM2711_DMA40_SIZE_128 | \
+	 BCM2711_DMA40_INC | \
+	 BCM2711_DMA40_BURST_LEN(16))
+
+struct bcm2835_dmadev *memcpy_parent;
+static void __iomem *memcpy_chan;
+static struct bcm2711_dma40_scb *memcpy_scb;
+static dma_addr_t memcpy_scb_dma;
+DEFINE_SPINLOCK(memcpy_lock);
+
+static const struct bcm2835_dma_cfg_data bcm2835_dma_cfg = {
+	.chan_40bit_mask = 0,
+};
+
+static const struct bcm2835_dma_cfg_data bcm2711_dma_cfg = {
+	.chan_40bit_mask = BIT(11) | BIT(12) | BIT(13) | BIT(14),
+};
+
 static inline size_t bcm2835_dma_max_frame_length(struct bcm2835_chan *c)
 {
 	/* lite and normal channels have different max frame length */
@@ -209,6 +332,32 @@ static inline struct bcm2835_desc *to_bc
 	return container_of(t, struct bcm2835_desc, vd.tx);
 }
 
+static inline uint32_t to_bcm2711_ti(uint32_t info)
+{
+	return ((info & BCM2835_DMA_INT_EN) ? BCM2711_DMA40_INTEN : 0) |
+		((info & BCM2835_DMA_WAIT_RESP) ? BCM2711_DMA40_WAIT_RESP : 0) |
+		((info & BCM2835_DMA_S_DREQ) ?
+		 (BCM2711_DMA40_S_DREQ | BCM2711_DMA40_WAIT_RD_RESP) : 0) |
+		((info & BCM2835_DMA_D_DREQ) ? BCM2711_DMA40_D_DREQ : 0) |
+		BCM2711_DMA40_PER_MAP((info >> 16) & 0x1f);
+}
+
+static inline uint32_t to_bcm2711_srci(uint32_t info)
+{
+	return ((info & BCM2835_DMA_S_INC) ? BCM2711_DMA40_INC : 0);
+}
+
+static inline uint32_t to_bcm2711_dsti(uint32_t info)
+{
+	return ((info & BCM2835_DMA_D_INC) ? BCM2711_DMA40_INC : 0);
+}
+
+static inline uint32_t to_bcm2711_cbaddr(dma_addr_t addr)
+{
+	BUG_ON(addr & 0x1f);
+	return (addr >> 5);
+}
+
 static void bcm2835_dma_free_cb_chain(struct bcm2835_desc *desc)
 {
 	size_t i;
@@ -227,45 +376,53 @@ static void bcm2835_dma_desc_free(struct
 }
 
 static void bcm2835_dma_create_cb_set_length(
-	struct bcm2835_chan *chan,
+	struct bcm2835_chan *c,
 	struct bcm2835_dma_cb *control_block,
 	size_t len,
 	size_t period_len,
 	size_t *total_len,
 	u32 finalextrainfo)
 {
-	size_t max_len = bcm2835_dma_max_frame_length(chan);
+	size_t max_len = bcm2835_dma_max_frame_length(c);
+	uint32_t cb_len;
 
 	/* set the length taking lite-channel limitations into account */
-	control_block->length = min_t(u32, len, max_len);
+	cb_len = min_t(u32, len, max_len);
 
-	/* finished if we have no period_length */
-	if (!period_len)
-		return;
+	if (period_len) {
+		/*
+		 * period_len means: that we need to generate
+		 * transfers that are terminating at every
+		 * multiple of period_len - this is typically
+		 * used to set the interrupt flag in info
+		 * which is required during cyclic transfers
+		 */
 
-	/*
-	 * period_len means: that we need to generate
-	 * transfers that are terminating at every
-	 * multiple of period_len - this is typically
-	 * used to set the interrupt flag in info
-	 * which is required during cyclic transfers
-	 */
+		/* have we filled in period_length yet? */
+		if (*total_len + cb_len < period_len) {
+			/* update number of bytes in this period so far */
+			*total_len += cb_len;
+		} else {
+			/* calculate the length that remains to reach period_len */
+			cb_len = period_len - *total_len;
 
-	/* have we filled in period_length yet? */
-	if (*total_len + control_block->length < period_len) {
-		/* update number of bytes in this period so far */
-		*total_len += control_block->length;
-		return;
+			/* reset total_length for next period */
+			*total_len = 0;
+		}
 	}
 
-	/* calculate the length that remains to reach period_length */
-	control_block->length = period_len - *total_len;
-
-	/* reset total_length for next period */
-	*total_len = 0;
-
-	/* add extrainfo bits in info */
-	control_block->info |= finalextrainfo;
+	if (c->is_40bit_channel) {
+		struct bcm2711_dma40_scb *scb =
+			(struct bcm2711_dma40_scb *)control_block;
+
+		scb->len = cb_len;
+		/* add extrainfo bits to ti */
+		scb->ti |= to_bcm2711_ti(finalextrainfo);
+	} else {
+		control_block->length = cb_len;
+		/* add extrainfo bits to info */
+		control_block->info |= finalextrainfo;
+	}
 }
 
 static inline size_t bcm2835_dma_count_frames_for_sg(
@@ -288,7 +445,7 @@ static inline size_t bcm2835_dma_count_f
 /**
  * bcm2835_dma_create_cb_chain - create a control block and fills data in
  *
- * @chan:           the @dma_chan for which we run this
+ * @c:              the @bcm2835_chan for which we run this
  * @direction:      the direction in which we transfer
  * @cyclic:         it is a cyclic transfer
  * @info:           the default info bits to apply per controlblock
@@ -306,12 +463,11 @@ static inline size_t bcm2835_dma_count_f
  * @gfp:            the GFP flag to use for allocation
  */
 static struct bcm2835_desc *bcm2835_dma_create_cb_chain(
-	struct dma_chan *chan, enum dma_transfer_direction direction,
+	struct bcm2835_chan *c, enum dma_transfer_direction direction,
 	bool cyclic, u32 info, u32 finalextrainfo, size_t frames,
 	dma_addr_t src, dma_addr_t dst, size_t buf_len,
 	size_t period_len, gfp_t gfp)
 {
-	struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
 	size_t len = buf_len, total_len;
 	size_t frame;
 	struct bcm2835_desc *d;
@@ -343,11 +499,23 @@ static struct bcm2835_desc *bcm2835_dma_
 
 		/* fill in the control block */
 		control_block = cb_entry->cb;
-		control_block->info = info;
-		control_block->src = src;
-		control_block->dst = dst;
-		control_block->stride = 0;
-		control_block->next = 0;
+		if (c->is_40bit_channel) {
+			struct bcm2711_dma40_scb *scb =
+				(struct bcm2711_dma40_scb *)control_block;
+			scb->ti = to_bcm2711_ti(info);
+			scb->src = lower_32_bits(src);
+			scb->srci= upper_32_bits(src) | to_bcm2711_srci(info);
+			scb->dst = lower_32_bits(dst);
+			scb->dsti = upper_32_bits(dst) | to_bcm2711_dsti(info);
+			scb->next_cb = 0;
+		} else {
+			control_block->info = info;
+			control_block->src = src;
+			control_block->dst = dst;
+			control_block->stride = 0;
+			control_block->next = 0;
+		}
+
 		/* set up length in control_block if requested */
 		if (buf_len) {
 			/* calculate length honoring period_length */
@@ -361,7 +529,11 @@ static struct bcm2835_desc *bcm2835_dma_
 		}
 
 		/* link this the last controlblock */
-		if (frame)
+		if (frame && c->is_40bit_channel)
+			((struct bcm2711_dma40_scb *)
+			 d->cb_list[frame - 1].cb)->next_cb =
+				to_bcm2711_cbaddr(cb_entry->paddr);
+		if (frame && !c->is_40bit_channel)
 			d->cb_list[frame - 1].cb->next = cb_entry->paddr;
 
 		/* update src and dst and length */
@@ -371,11 +543,21 @@ static struct bcm2835_desc *bcm2835_dma_
 			dst += control_block->length;
 
 		/* Length of total transfer */
-		d->size += control_block->length;
+		if (c->is_40bit_channel)
+			d->size += ((struct bcm2711_dma40_scb *)control_block)->len;
+		else
+			d->size += control_block->length;
 	}
 
 	/* the last frame requires extra flags */
-	d->cb_list[d->frames - 1].cb->info |= finalextrainfo;
+	if (c->is_40bit_channel) {
+		struct bcm2711_dma40_scb *scb =
+			(struct bcm2711_dma40_scb *)d->cb_list[d->frames-1].cb;
+
+		scb->ti |= to_bcm2711_ti(finalextrainfo);
+	} else {
+		d->cb_list[d->frames - 1].cb->info |= finalextrainfo;
+	}
 
 	/* detect a size missmatch */
 	if (buf_len && (d->size != buf_len))
@@ -389,13 +571,12 @@ error_cb:
 }
 
 static void bcm2835_dma_fill_cb_chain_with_sg(
-	struct dma_chan *chan,
+	struct bcm2835_chan *c,
 	enum dma_transfer_direction direction,
 	struct bcm2835_cb_entry *cb,
 	struct scatterlist *sgl,
 	unsigned int sg_len)
 {
-	struct bcm2835_chan *c = to_bcm2835_dma_chan(chan);
 	size_t len, max_len;
 	unsigned int i;
 	dma_addr_t addr;
@@ -403,14 +584,35 @@ static void bcm2835_dma_fill_cb_chain_wi
 
 	max_len = bcm2835_dma_max_frame_length(c);
 	for_each_sg(sgl, sgent, sg_len, i) {
-		for (addr = sg_dma_address(sgent), len = sg_dma_len(sgent);
-		     len > 0;
-		     addr += cb->cb->length, len -= cb->cb->length, cb++) {
-			if (direction == DMA_DEV_TO_MEM)
-				cb->cb->dst = addr;
-			else
-				cb->cb->src = addr;
-			cb->cb->length = min(len, max_len);
+		if (c->is_40bit_channel) {
+			struct bcm2711_dma40_scb *scb;
+
+			for (addr = sg_dma_address(sgent),
+				     len = sg_dma_len(sgent);
+				     len > 0;
+			     addr += scb->len, len -= scb->len, cb++) {
+				scb = (struct bcm2711_dma40_scb *)cb->cb;
+				if (direction == DMA_DEV_TO_MEM) {
+					scb->dst = lower_32_bits(addr);
+					scb->dsti = upper_32_bits(addr) | BCM2711_DMA40_INC;
+				} else {
+					scb->src = lower_32_bits(addr);
+					scb->srci = upper_32_bits(addr) | BCM2711_DMA40_INC;
+				}
+				scb->len = min(len, max_len);
+			}
+		} else {
+			for (addr = sg_dma_address(sgent),
+				     len = sg_dma_len(sgent);
+			     len > 0;
+			     addr += cb->cb->length, len -= cb->cb->length,
+			     cb++) {
+				if (direction == DMA_DEV_TO_MEM)
+					cb->cb->dst = addr;
+				else
+					cb->cb->src = addr;
+				cb->cb->length = min(len, max_len);
+			}
 		}
 	}
 }
@@ -419,6 +621,10 @@ static void bcm2835_dma_abort(struct bcm
 {
 	void __iomem *chan_base = c->chan_base;
 	long int timeout = 10000;
+	u32 wait_mask = BCM2835_DMA_WAITING_FOR_WRITES;
+
+	if (c->is_40bit_channel)
+		wait_mask = BCM2711_DMA40_WAITING_FOR_WRITES;
 
 	/*
 	 * A zero control block address means the channel is idle.
@@ -431,8 +637,7 @@ static void bcm2835_dma_abort(struct bcm
 	writel(0, chan_base + BCM2835_DMA_CS);
 
 	/* Wait for any current AXI transfer to complete */
-	while ((readl(chan_base + BCM2835_DMA_CS) &
-		BCM2835_DMA_WAITING_FOR_WRITES) && --timeout)
+	while ((readl(chan_base + BCM2835_DMA_CS) & wait_mask) && --timeout)
 		cpu_relax();
 
 	/* Peripheral might be stuck and fail to signal AXI write responses */
@@ -457,9 +662,16 @@ static void bcm2835_dma_start_desc(struc
 
 	c->desc = d = to_bcm2835_dma_desc(&vd->tx);
 
-	writel(d->cb_list[0].paddr, c->chan_base + BCM2835_DMA_ADDR);
-	writel(BCM2835_DMA_ACTIVE | BCM2835_DMA_CS_FLAGS(c->dreq),
-	       c->chan_base + BCM2835_DMA_CS);
+	if (c->is_40bit_channel) {
+		writel(to_bcm2711_cbaddr(d->cb_list[0].paddr),
+		       c->chan_base + BCM2711_DMA40_CB);
+		writel(BCM2711_DMA40_ACTIVE | BCM2711_DMA40_CS_FLAGS(c->dreq),
+		       c->chan_base + BCM2711_DMA40_CS);
+	} else {
+		writel(d->cb_list[0].paddr, c->chan_base + BCM2835_DMA_ADDR);
+		writel(BCM2835_DMA_ACTIVE | BCM2835_DMA_CS_FLAGS(c->dreq),
+		       c->chan_base + BCM2835_DMA_CS);
+	}
 }
 
 static irqreturn_t bcm2835_dma_callback(int irq, void *data)
@@ -486,8 +698,7 @@ static irqreturn_t bcm2835_dma_callback(
 	 * if this IRQ handler is threaded.) If the channel is finished, it
 	 * will remain idle despite the ACTIVE flag being set.
 	 */
-	writel(BCM2835_DMA_INT | BCM2835_DMA_ACTIVE |
-	       BCM2835_DMA_CS_FLAGS(c->dreq),
+	writel(BCM2835_DMA_INT | BCM2835_DMA_ACTIVE,
 	       c->chan_base + BCM2835_DMA_CS);
 
 	d = c->desc;
@@ -590,9 +801,17 @@ static enum dma_status bcm2835_dma_tx_st
 		struct bcm2835_desc *d = c->desc;
 		dma_addr_t pos;
 
-		if (d->dir == DMA_MEM_TO_DEV)
+		if (d->dir == DMA_MEM_TO_DEV && c->is_40bit_channel)
+			pos = readl(c->chan_base + BCM2711_DMA40_SRC) +
+				((readl(c->chan_base + BCM2711_DMA40_SRCI) &
+				  0xff) << 8);
+		else if (d->dir == DMA_MEM_TO_DEV && !c->is_40bit_channel)
 			pos = readl(c->chan_base + BCM2835_DMA_SOURCE_AD);
-		else if (d->dir == DMA_DEV_TO_MEM)
+		else if (d->dir == DMA_DEV_TO_MEM && c->is_40bit_channel)
+			pos = readl(c->chan_base + BCM2711_DMA40_DEST) +
+				((readl(c->chan_base + BCM2711_DMA40_DESTI) &
+				  0xff) << 8);
+		else if (d->dir == DMA_DEV_TO_MEM && !c->is_40bit_channel)
 			pos = readl(c->chan_base + BCM2835_DMA_DEST_AD);
 		else
 			pos = 0;
@@ -638,7 +857,7 @@ static struct dma_async_tx_descriptor *b
 	frames = bcm2835_dma_frames_for_length(len, max_len);
 
 	/* allocate the CB chain - this also fills in the pointers */
-	d = bcm2835_dma_create_cb_chain(chan, DMA_MEM_TO_MEM, false,
+	d = bcm2835_dma_create_cb_chain(c, DMA_MEM_TO_MEM, false,
 					info, extra, frames,
 					src, dst, len, 0, GFP_KERNEL);
 	if (!d)
@@ -673,11 +892,21 @@ static struct dma_async_tx_descriptor *b
 		if (c->cfg.src_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)
 			return NULL;
 		src = c->cfg.src_addr;
+		/*
+		 * One would think it ought to be possible to get the physical
+		 * to dma address mapping information from the dma-ranges DT
+		 * property, but I've not found a way yet that doesn't involve
+		 * open-coding the whole thing.
+		 */
+		if (c->is_40bit_channel)
+		    src |= 0x400000000ull;
 		info |= BCM2835_DMA_S_DREQ | BCM2835_DMA_D_INC;
 	} else {
 		if (c->cfg.dst_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)
 			return NULL;
 		dst = c->cfg.dst_addr;
+		if (c->is_40bit_channel)
+		    dst |= 0x400000000ull;
 		info |= BCM2835_DMA_D_DREQ | BCM2835_DMA_S_INC;
 	}
 
@@ -685,7 +914,7 @@ static struct dma_async_tx_descriptor *b
 	frames = bcm2835_dma_count_frames_for_sg(c, sgl, sg_len);
 
 	/* allocate the CB chain */
-	d = bcm2835_dma_create_cb_chain(chan, direction, false,
+	d = bcm2835_dma_create_cb_chain(c, direction, false,
 					info, extra,
 					frames, src, dst, 0, 0,
 					GFP_NOWAIT);
@@ -693,7 +922,7 @@ static struct dma_async_tx_descriptor *b
 		return NULL;
 
 	/* fill in frames with scatterlist pointers */
-	bcm2835_dma_fill_cb_chain_with_sg(chan, direction, d->cb_list,
+	bcm2835_dma_fill_cb_chain_with_sg(c, direction, d->cb_list,
 					  sgl, sg_len);
 
 	return vchan_tx_prep(&c->vc, &d->vd, flags);
@@ -747,12 +976,16 @@ static struct dma_async_tx_descriptor *b
 		if (c->cfg.src_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)
 			return NULL;
 		src = c->cfg.src_addr;
+		if (c->is_40bit_channel)
+		    src |= 0x400000000ull;
 		dst = buf_addr;
 		info |= BCM2835_DMA_S_DREQ | BCM2835_DMA_D_INC;
 	} else {
 		if (c->cfg.dst_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES)
 			return NULL;
 		dst = c->cfg.dst_addr;
+		if (c->is_40bit_channel)
+		    dst |= 0x400000000ull;
 		src = buf_addr;
 		info |= BCM2835_DMA_D_DREQ | BCM2835_DMA_S_INC;
 
@@ -772,7 +1005,7 @@ static struct dma_async_tx_descriptor *b
 	 * note that we need to use GFP_NOWAIT, as the ALSA i2s dmaengine
 	 * implementation calls prep_dma_cyclic with interrupts disabled.
 	 */
-	d = bcm2835_dma_create_cb_chain(chan, direction, true,
+	d = bcm2835_dma_create_cb_chain(c, direction, true,
 					info, extra,
 					frames, src, dst, buf_len,
 					period_len, GFP_NOWAIT);
@@ -780,7 +1013,12 @@ static struct dma_async_tx_descriptor *b
 		return NULL;
 
 	/* wrap around into a loop */
-	d->cb_list[d->frames - 1].cb->next = d->cb_list[0].paddr;
+	if (c->is_40bit_channel)
+		((struct bcm2711_dma40_scb *)
+		 d->cb_list[frames - 1].cb)->next_cb =
+			to_bcm2711_cbaddr(d->cb_list[0].paddr);
+	else
+		d->cb_list[d->frames - 1].cb->next = d->cb_list[0].paddr;
 
 	return vchan_tx_prep(&c->vc, &d->vd, flags);
 }
@@ -844,9 +1082,11 @@ static int bcm2835_dma_chan_init(struct
 	c->irq_number = irq;
 	c->irq_flags = irq_flags;
 
-	/* check in DEBUG register if this is a LITE channel */
-	if (readl(c->chan_base + BCM2835_DMA_DEBUG) &
-		BCM2835_DMA_DEBUG_LITE)
+	/* check for 40bit and lite channels */
+	if (d->cfg_data->chan_40bit_mask & BIT(chan_id))
+		c->is_40bit_channel = true;
+	else if (readl(c->chan_base + BCM2835_DMA_DEBUG) &
+		 BCM2835_DMA_DEBUG_LITE)
 		c->is_lite_channel = true;
 
 	return 0;
@@ -866,8 +1106,58 @@ static void bcm2835_dma_free(struct bcm2
 			     DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
 }
 
+int bcm2711_dma40_memcpy_init(void)
+{
+	if (!memcpy_parent)
+		return -EPROBE_DEFER;
+
+	if (!memcpy_chan)
+		return -EINVAL;
+
+	if (!memcpy_scb)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL(bcm2711_dma40_memcpy_init);
+
+void bcm2711_dma40_memcpy(dma_addr_t dst, dma_addr_t src, size_t size)
+{
+	struct bcm2711_dma40_scb *scb = memcpy_scb;
+	unsigned long flags;
+
+	if (!scb) {
+		pr_err("bcm2711_dma40_memcpy not initialised!\n");
+		return;
+	}
+
+	spin_lock_irqsave(&memcpy_lock, flags);
+
+	scb->ti = 0;
+	scb->src = lower_32_bits(src);
+	scb->srci = upper_32_bits(src) | BCM2711_DMA40_MEMCPY_XFER_INFO;
+	scb->dst = lower_32_bits(dst);
+	scb->dsti = upper_32_bits(dst) | BCM2711_DMA40_MEMCPY_XFER_INFO;
+	scb->len = size;
+	scb->next_cb = 0;
+
+	writel((u32)(memcpy_scb_dma >> 5), memcpy_chan + BCM2711_DMA40_CB);
+	writel(BCM2711_DMA40_MEMCPY_FLAGS + BCM2711_DMA40_ACTIVE,
+	       memcpy_chan + BCM2711_DMA40_CS);
+
+	/* Poll for completion */
+	while (!(readl(memcpy_chan + BCM2711_DMA40_CS) & BCM2711_DMA40_END))
+		cpu_relax();
+
+	writel(BCM2711_DMA40_END, memcpy_chan + BCM2711_DMA40_CS);
+
+	spin_unlock_irqrestore(&memcpy_lock, flags);
+}
+EXPORT_SYMBOL(bcm2711_dma40_memcpy);
+
 static const struct of_device_id bcm2835_dma_of_match[] = {
-	{ .compatible = "brcm,bcm2835-dma", },
+	{ .compatible = "brcm,bcm2835-dma", .data = &bcm2835_dma_cfg },
+	{ .compatible = "brcm,bcm2711-dma", .data = &bcm2711_dma_cfg },
 	{},
 };
 MODULE_DEVICE_TABLE(of, bcm2835_dma_of_match);
@@ -899,6 +1189,8 @@ static int bcm2835_dma_probe(struct plat
 	int irq_flags;
 	uint32_t chans_available;
 	char chan_name[BCM2835_DMA_CHAN_NAME_SIZE];
+	const struct of_device_id *of_id;
+	int chan_count, chan_start, chan_end;
 
 	if (!pdev->dev.dma_mask)
 		pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
@@ -920,9 +1212,13 @@ static int bcm2835_dma_probe(struct plat
 	base = devm_ioremap_resource(&pdev->dev, res);
 	if (IS_ERR(base))
 		return PTR_ERR(base);
-	rc = bcm_dmaman_probe(pdev, base, BCM2835_DMA_BULK_MASK);
-	if (rc)
-		dev_err(&pdev->dev, "Failed to initialize the legacy API\n");
+
+	/* The set of channels can be split across multiple instances. */
+	chan_start = ((u32)(uintptr_t)base / BCM2835_DMA_CHAN_SIZE) & 0xf;
+	base -= BCM2835_DMA_CHAN(chan_start);
+	chan_count = resource_size(res) / BCM2835_DMA_CHAN_SIZE;
+	chan_end = min(chan_start + chan_count,
+			 BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED + 1);
 
 	od->base = base;
 
@@ -959,6 +1255,14 @@ static int bcm2835_dma_probe(struct plat
 		return -ENOMEM;
 	}
 
+	of_id = of_match_node(bcm2835_dma_of_match, pdev->dev.of_node);
+	if (!of_id) {
+		dev_err(&pdev->dev, "Failed to match compatible string\n");
+		return -EINVAL;
+	}
+
+	od->cfg_data = of_id->data;
+
 	/* Request DMA channel mask from device tree */
 	if (of_property_read_u32(pdev->dev.of_node,
 			"brcm,dma-channel-mask",
@@ -968,11 +1272,34 @@ static int bcm2835_dma_probe(struct plat
 		goto err_no_dma;
 	}
 
-	/* Channel 0 is used by the legacy API */
-	chans_available &= ~BCM2835_DMA_BULK_MASK;
+	/* One channel is reserved for the legacy API */
+	if (chans_available & BCM2835_DMA_BULK_MASK) {
+		rc = bcm_dmaman_probe(pdev, base,
+				      chans_available & BCM2835_DMA_BULK_MASK);
+		if (rc)
+			dev_err(&pdev->dev,
+				"Failed to initialize the legacy API\n");
+
+		chans_available &= ~BCM2835_DMA_BULK_MASK;
+	}
+
+	/* And possibly one for the 40-bit DMA memcpy API */
+	if (chans_available & od->cfg_data->chan_40bit_mask &
+	    BIT(BCM2711_DMA_MEMCPY_CHAN)) {
+		memcpy_parent = od;
+		memcpy_chan = BCM2835_DMA_CHANIO(base, BCM2711_DMA_MEMCPY_CHAN);
+		memcpy_scb = dma_alloc_coherent(memcpy_parent->ddev.dev,
+						sizeof(*memcpy_scb),
+						&memcpy_scb_dma, GFP_KERNEL);
+		if (!memcpy_scb)
+			dev_warn(&pdev->dev,
+				 "Failed to allocated memcpy scb\n");
+
+		chans_available &= ~BIT(BCM2711_DMA_MEMCPY_CHAN);
+	}
 
 	/* get irqs for each channel that we support */
-	for (i = 0; i <= BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED; i++) {
+	for (i = chan_start; i < chan_end; i++) {
 		/* skip masked out channels */
 		if (!(chans_available & (1 << i))) {
 			irq[i] = -1;
@@ -995,13 +1322,17 @@ static int bcm2835_dma_probe(struct plat
 		irq[i] = platform_get_irq(pdev, i < 11 ? i : 11);
 	}
 
+	chan_count = 0;
+
 	/* get irqs for each channel */
-	for (i = 0; i <= BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED; i++) {
+	for (i = chan_start; i < chan_end; i++) {
 		/* skip channels without irq */
 		if (irq[i] < 0)
 			continue;
 
 		/* check if there are other channels that also use this irq */
+		/* FIXME: This will fail if interrupts are shared across
+		   instances */
 		irq_flags = 0;
 		for (j = 0; j <= BCM2835_DMA_MAX_DMA_CHAN_SUPPORTED; j++)
 			if ((i != j) && (irq[j] == irq[i])) {
@@ -1013,9 +1344,10 @@ static int bcm2835_dma_probe(struct plat
 		rc = bcm2835_dma_chan_init(od, i, irq[i], irq_flags);
 		if (rc)
 			goto err_no_dma;
+		chan_count++;
 	}
 
-	dev_dbg(&pdev->dev, "Initialized %i DMA channels\n", i);
+	dev_dbg(&pdev->dev, "Initialized %i DMA channels\n", chan_count);
 
 	/* Device-tree DMA controller registration */
 	rc = of_dma_controller_register(pdev->dev.of_node,
@@ -1047,6 +1379,13 @@ static int bcm2835_dma_remove(struct pla
 
 	bcm_dmaman_remove(pdev);
 	dma_async_device_unregister(&od->ddev);
+	if (memcpy_parent == od) {
+		dma_free_coherent(&pdev->dev, sizeof(*memcpy_scb), memcpy_scb,
+				  memcpy_scb_dma);
+		memcpy_parent = NULL;
+		memcpy_scb = NULL;
+		memcpy_chan = NULL;
+	}
 	bcm2835_dma_free(od);
 
 	return 0;