22 files changed, 1161 insertions, 209 deletions
diff --git a/kernel/rtlil.cc b/kernel/rtlil.cc
index a09f4a0d1..fd98ab4bd 100644
--- a/kernel/rtlil.cc
+++ b/kernel/rtlil.cc
@@ -3297,7 +3297,7 @@ void RTLIL::SigSpec::replace(int offset, const RTLIL::SigSpec &with)
 	check();
 }
 
-void RTLIL::SigSpec::remove_const()
+RTLIL::SigSpec& RTLIL::SigSpec::remove_const()
 {
 	if (packed())
 	{
@@ -3331,6 +3331,7 @@ void RTLIL::SigSpec::remove_const()
 	}
 
 	check();
+	return *this;
 }
 
 void RTLIL::SigSpec::remove(int offset, int length)
@@ -3353,7 +3354,13 @@ RTLIL::SigSpec RTLIL::SigSpec::extract(int offset, int length) const
 {
 	unpack();
 	cover("kernel.rtlil.sigspec.extract_pos");
-	return std::vector<RTLIL::SigBit>(bits_.begin() + offset, bits_.begin() + offset + length);
+	auto it = bits_.begin() + std::min<int>(offset, width_);	// start position, clamped to the end of the signal (offset must be >= 0)
+	decltype(it) ie;
+	if (length >= 0)
+		ie = bits_.begin() + std::min<int>(offset + length, width_);	// end position, clamped to the end of the signal
+	else	// negative length is relative to the end (-1 == through the last bit); keep ie within [it, end()]
+		ie = bits_.end() + std::max<int>(length + 1, std::min<int>(offset, width_) - width_);
+	return std::vector<RTLIL::SigBit>(it, ie);
 }
 
 void RTLIL::SigSpec::append(const RTLIL::SigSpec &signal)
@@ -3426,7 +3433,7 @@ void RTLIL::SigSpec::append_bit(const RTLIL::SigBit &bit)
 	check();
 }
 
-void RTLIL::SigSpec::extend_u0(int width, bool is_signed)
+RTLIL::SigSpec& RTLIL::SigSpec::extend_u0(int width, bool is_signed)
 {
 	cover("kernel.rtlil.sigspec.extend_u0");
 
@@ -3443,6 +3450,7 @@ void RTLIL::SigSpec::extend_u0(int width, bool is_signed)
 			append(padding);
 	}
 
+	return *this;
 }
 
 RTLIL::SigSpec RTLIL::SigSpec::repeat(int num) const
diff --git a/kernel/rtlil.h b/kernel/rtlil.h
index 712250b3e..02bf274fb 100644
--- a/kernel/rtlil.h
+++ b/kernel/rtlil.h
@@ -750,8 +750,8 @@ public:
 	inline int size() const { return width_; }
 	inline bool empty() const { return width_ == 0; }
 
-	inline RTLIL::SigBit &operator[](int index) { inline_unpack(); return bits_.at(index); }
-	inline const RTLIL::SigBit &operator[](int index) const { inline_unpack(); return bits_.at(index); }
+	inline RTLIL::SigBit &operator[](int index) { inline_unpack(); return bits_.at(index < 0 ? width_ + index : index); } // negative index counts back from the end (-1 == last bit)
+	inline const RTLIL::SigBit &operator[](int index) const { inline_unpack(); return bits_.at(index < 0 ? width_ + index : index); } // negative index counts back from the end (-1 == last bit)
 
 	inline RTLIL::SigSpecIterator begin() { RTLIL::SigSpecIterator it; it.sig_p = this; it.index = 0; return it; }
 	inline RTLIL::SigSpecIterator end() { RTLIL::SigSpecIterator it; it.sig_p = this; it.index = width_; return it; }
@@ -783,7 +783,7 @@ public:
 	void remove2(const std::set<RTLIL::SigBit> &pattern, RTLIL::SigSpec *other);
 
 	void remove(int offset, int length = 1);
-	void remove_const();
+	RTLIL::SigSpec& remove_const();
 
 	RTLIL::SigSpec extract(const RTLIL::SigSpec &pattern, const RTLIL::SigSpec *other = NULL) const;
 	RTLIL::SigSpec extract(const pool<RTLIL::SigBit> &pattern, const RTLIL::SigSpec *other = NULL) const;
@@ -792,7 +792,7 @@ public:
 	void append(const RTLIL::SigSpec &signal);
 	void append_bit(const RTLIL::SigBit &bit);
 
-	void extend_u0(int width, bool is_signed = false);
+	RTLIL::SigSpec& extend_u0(int width, bool is_signed = false);
 
 	RTLIL::SigSpec repeat(int num) const;
 
@@ -834,6 +834,7 @@ public:
 
 	operator std::vector<RTLIL::SigChunk>() const { return chunks(); }
 	operator std::vector<RTLIL::SigBit>() const { return bits(); }
+	RTLIL::SigBit at(int offset, const RTLIL::SigBit &defval) const { return offset < width_ ? (*this)[offset] : defval; } // bit at 'offset', or defval when offset >= size(); negative offsets follow operator[]
 
 	unsigned int hash() const { if (!hash_) updhash(); return hash_; };
 
diff --git a/passes/opt/wreduce.cc b/passes/opt/wreduce.cc
index 1fbc41082..908a85d5b 100644
--- a/passes/opt/wreduce.cc
+++ b/passes/opt/wreduce.cc
@@ -342,9 +342,9 @@ struct WreduceWorker
 			}
 		}
 
-		if (cell->type.in("$pos", "$add", "$mul", "$and", "$or", "$xor"))
+		if (cell->type.in("$pos", "$add", "$mul", "$and", "$or", "$xor", "$sub"))
 		{
-			bool is_signed = cell->getParam("\\A_SIGNED").as_bool();
+			bool is_signed = cell->getParam("\\A_SIGNED").as_bool() || cell->type == "$sub";
 
 			int a_size = 0, b_size = 0;
 			if (cell->hasPort("\\A")) a_size = GetSize(cell->getPort("\\A"));
@@ -352,7 +352,7 @@ struct WreduceWorker
 
 			int max_y_size = max(a_size, b_size);
 
-			if (cell->type == "$add")
+			if (cell->type.in("$add", "$sub"))
 				max_y_size++;
 
 			if (cell->type == "$mul")
@@ -365,6 +365,29 @@ struct WreduceWorker
 			}
 		}
 
+		if (cell->type.in("$add", "$sub")) {	// peel off low bits where B is 0 (or, for $add, A is 0): that Y bit is just the other operand's bit
+			SigSpec A = mi.sigmap(cell->getPort("\\A"));
+			SigSpec B = mi.sigmap(cell->getPort("\\B"));
+			bool sub = cell->type == "$sub";
+
+			int i;	// after the loop: number of low-order Y bits that bypass the cell
+			for (i = 0; i < GetSize(sig) && i < GetSize(A) && i < GetSize(B); i++) {	// A[i]/B[i] are bounds-checked, so stop at the narrowest port
+				if (B.at(i, Sx) != S0 && (sub || A.at(i, Sx) != S0))
+					break;
+				if (B[i] == S0)
+					module->connect(sig[i], A[i]);
+				else if (A[i] == S0)
+					module->connect(sig[i], B[i]);
+				else log_abort();
+			}
+			if (i > 0) {	// drop the bypassed bits from both inputs and from the output
+				cell->setPort("\\A", A.extract(i, -1));
+				cell->setPort("\\B", B.extract(i, -1));
+				sig.remove(0, i);
+				bits_removed += i;
+			}
+		}
+
 		if (GetSize(sig) == 0) {
 			log("Removed cell %s.%s (%s).\n", log_id(module), log_id(cell), log_id(cell->type));
 			module->remove(cell);
@@ -372,7 +395,7 @@ struct WreduceWorker
 		}
 
 		if (bits_removed) {
-			log("Removed top %d bits (of %d) from port Y of cell %s.%s (%s).\n",
+			log("Removed %d bits (of %d) from port Y of cell %s.%s (%s).\n",
 					bits_removed, GetSize(sig) + bits_removed, log_id(module), log_id(cell), log_id(cell->type));
 			cell->setPort("\\Y", sig);
 			did_something = true;
diff --git a/passes/pmgen/.gitignore b/passes/pmgen/.gitignore
index 0ad36ea2c..e52f3282f 100644
--- a/passes/pmgen/.gitignore
+++ b/passes/pmgen/.gitignore
@@ -1,2 +1 @@
-/ice40_dsp_pm.h
-/peepopt_pm.h
+/*_pm.h
diff --git a/passes/pmgen/Makefile.inc b/passes/pmgen/Makefile.inc
index 7911132db..e33866670 100644
--- a/passes/pmgen/Makefile.inc
+++ b/passes/pmgen/Makefile.inc
@@ -1,14 +1,19 @@
 OBJS += passes/pmgen/ice40_dsp.o
+OBJS += passes/pmgen/xilinx_dsp.o
 OBJS += passes/pmgen/peepopt.o
 
 # --------------------------------------
 
+passes/pmgen/%.o: passes/pmgen/%_pm.h
 passes/pmgen/ice40_dsp.o: passes/pmgen/ice40_dsp_pm.h
+passes/pmgen/xilinx_dsp.o: passes/pmgen/xilinx_dsp_pm.h
 EXTRA_OBJS += passes/pmgen/ice40_dsp_pm.h
+EXTRA_OBJS += passes/pmgen/xilinx_dsp_pm.h
 .SECONDARY: passes/pmgen/ice40_dsp_pm.h
+.SECONDARY: passes/pmgen/xilinx_dsp_pm.h
 
-passes/pmgen/ice40_dsp_pm.h: passes/pmgen/pmgen.py passes/pmgen/ice40_dsp.pmg
-	$(P) mkdir -p passes/pmgen && python3 $< -o $@ -p ice40_dsp $(filter-out $<,$^)
+passes/pmgen/%_pm.h: passes/pmgen/pmgen.py passes/pmgen/%.pmg
+	$(P) mkdir -p passes/pmgen && python3 $< -o $@ -p $* $(filter-out $<,$^)
 
 # --------------------------------------
 
diff --git a/passes/pmgen/ice40_dsp.cc b/passes/pmgen/ice40_dsp.cc
index 39d033a04..f88cd62dd 100644
--- a/passes/pmgen/ice40_dsp.cc
+++ b/passes/pmgen/ice40_dsp.cc
@@ -23,6 +23,9 @@
 USING_YOSYS_NAMESPACE
 PRIVATE_NAMESPACE_BEGIN
 
+template<class T> bool includes(const T &lhs, const T &rhs) { // is sorted range 'rhs' a subset of sorted range 'lhs'?
+	return std::includes(std::begin(lhs), std::end(lhs), std::begin(rhs), std::end(rhs));
+}
 #include "passes/pmgen/ice40_dsp_pm.h"
 
 void create_ice40_dsp(ice40_dsp_pm &pm)
@@ -31,13 +34,14 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 
 #if 0
 	log("\n");
-	log("ffA:   %s\n", log_id(st.ffA, "--"));
-	log("ffB:   %s\n", log_id(st.ffB, "--"));
-	log("mul:   %s\n", log_id(st.mul, "--"));
-	log("ffY:   %s\n", log_id(st.ffY, "--"));
-	log("addAB: %s\n", log_id(st.addAB, "--"));
-	log("muxAB: %s\n", log_id(st.muxAB, "--"));
-	log("ffS:   %s\n", log_id(st.ffS, "--"));
+	log("ffA:    %s\n", log_id(st.ffA, "--"));
+	log("ffB:    %s\n", log_id(st.ffB, "--"));
+	log("mul:    %s\n", log_id(st.mul, "--"));
+	log("ffH:    %s\n", log_id(st.ffH, "--"));
+	log("addAB:  %s\n", log_id(st.addAB, "--"));
+	log("muxAB:  %s\n", log_id(st.muxAB, "--"));
+	log("ffO_lo: %s\n", log_id(st.ffO_lo, "--"));
+	log("ffO_hi: %s\n", log_id(st.ffO_hi, "--"));
 #endif
 
 	log("Checking %s.%s for iCE40 DSP inference.\n", log_id(pm.module), log_id(st.mul));
@@ -52,47 +56,38 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 		return;
 	}
 
-	if (GetSize(st.sigS) > 32) {
-		log("  accumulator (%s) is too large (%d > 32).\n", log_signal(st.sigS), GetSize(st.sigS));
-		return;
-	}
-
-	if (GetSize(st.sigY) > 32) {
-		log("  output (%s) is too large (%d > 32).\n", log_signal(st.sigY), GetSize(st.sigY));
+	if (GetSize(st.sigO) > 33) {
+		log("  adder/accumulator (%s) is too large (%d > 33).\n", log_signal(st.sigO), GetSize(st.sigO));
 		return;
 	}
 
-	bool mul_signed = st.mul->getParam("\\A_SIGNED").as_bool();
-
-	if (mul_signed) {
-		log("  inference of signed iCE40 DSP arithmetic is currently not supported.\n");
+	if (GetSize(st.sigH) > 32) {
+		log("  output (%s) is too large (%d > 32).\n", log_signal(st.sigH), GetSize(st.sigH));
 		return;
 	}
 
-	log("  replacing $mul with SB_MAC16 cell.\n");
+	log("  replacing %s with SB_MAC16 cell.\n", log_id(st.mul->type));
 
 	Cell *cell = pm.module->addCell(NEW_ID, "\\SB_MAC16");
 	pm.module->swap_names(cell, st.mul);
 
 	// SB_MAC16 Input Interface
+	bool a_signed = st.mul->getParam("\\A_SIGNED").as_bool();
+	bool b_signed = st.mul->getParam("\\B_SIGNED").as_bool();
 
 	SigSpec A = st.sigA;
-	A.extend_u0(16, mul_signed);
+	A.extend_u0(16, a_signed);
 
 	SigSpec B = st.sigB;
-	B.extend_u0(16, mul_signed);
+	B.extend_u0(16, b_signed);
 
-	SigSpec CD;
-	if (st.muxA)
-		CD = st.muxA->getPort("\\B");
-	if (st.muxB)
-		CD = st.muxB->getPort("\\A");
-	CD.extend_u0(32, mul_signed);
+	SigSpec CD = st.sigCD;
+	CD.extend_u0(32, st.sigCD_signed);
 
 	cell->setPort("\\A", A);
 	cell->setPort("\\B", B);
-	cell->setPort("\\C", CD.extract(0, 16));
-	cell->setPort("\\D", CD.extract(16, 16));
+	cell->setPort("\\C", CD.extract(16, 16));
+	cell->setPort("\\D", CD.extract(0, 16));
 
 	cell->setParam("\\A_REG", st.ffA ? State::S1 : State::S0);
 	cell->setParam("\\B_REG", st.ffB ? State::S1 : State::S0);
@@ -105,7 +100,7 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 	cell->setPort("\\IRSTTOP", State::S0);
 	cell->setPort("\\IRSTBOT", State::S0);
 
-	if (st.clock_vld)
+	if (st.clock != SigBit())
 	{
 		cell->setPort("\\CLK", st.clock);
 		cell->setPort("\\CE", State::S1);
@@ -119,11 +114,13 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 		if (st.ffB)
 			log(" ffB:%s", log_id(st.ffB));
 
-		if (st.ffY)
-			log(" ffY:%s", log_id(st.ffY));
+		if (st.ffH)
+			log(" ffH:%s", log_id(st.ffH));
 
-		if (st.ffS)
-			log(" ffS:%s", log_id(st.ffS));
+		if (st.ffO_lo)
+			log(" ffO_lo:%s", log_id(st.ffO_lo));
+		if (st.ffO_hi)
+			log(" ffO_hi:%s", log_id(st.ffO_hi));
 
 		log("\n");
 	}
@@ -140,21 +137,43 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 	cell->setPort("\\SIGNEXTOUT", pm.module->addWire(NEW_ID));
 
 	cell->setPort("\\CI", State::Sx);
-	cell->setPort("\\CO", pm.module->addWire(NEW_ID));
 
 	cell->setPort("\\ACCUMCI", State::Sx);
 	cell->setPort("\\ACCUMCO", pm.module->addWire(NEW_ID));
 
 	// SB_MAC16 Output Interface
 
-	SigSpec O = st.ffS ? st.sigS : st.sigY;
+	SigSpec O = st.sigO;
+	int O_width = GetSize(O);
+	if (O_width == 33) {
+		log_assert(st.addAB);
+		// If we have a signed multiply-add, then perform sign extension
+		// TODO: Need to check CD[31:16] is sign extension of CD[15:0]?
+		if (st.addAB->getParam("\\A_SIGNED").as_bool() && st.addAB->getParam("\\B_SIGNED").as_bool())
+			pm.module->connect(O[-1], O[-2]);
+		else
+			cell->setPort("\\CO", O[-1]);
+		O.remove(O_width-1);
+	}
+	else
+		cell->setPort("\\CO", pm.module->addWire(NEW_ID));
+	log_assert(GetSize(O) <= 32);
 	if (GetSize(O) < 32)
 		O.append(pm.module->addWire(NEW_ID, 32-GetSize(O)));
 
 	cell->setPort("\\O", O);
 
+	bool accum = false;
 	if (st.addAB) {
-		log("  accumulator %s (%s)\n", log_id(st.addAB), log_id(st.addAB->type));
+		if (st.addA)
+			accum = (st.ffO_lo && st.ffO_hi && st.addAB->getPort("\\B") == st.sigO);
+		else if (st.addB)
+			accum = (st.ffO_lo && st.ffO_hi && st.addAB->getPort("\\A") == st.sigO);
+		else log_abort();
+		if (accum)
+			log("  accumulator %s (%s)\n", log_id(st.addAB), log_id(st.addAB->type));
+		else
+			log("  adder %s (%s)\n", log_id(st.addAB), log_id(st.addAB->type));
 		cell->setPort("\\ADDSUBTOP", st.addAB->type == "$add" ? State::S0 : State::S1);
 		cell->setPort("\\ADDSUBBOT", st.addAB->type == "$add" ? State::S0 : State::S1);
 	} else {
@@ -182,28 +201,36 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 	cell->setParam("\\C_REG", State::S0);
 	cell->setParam("\\D_REG", State::S0);
 
-	cell->setParam("\\TOP_8x8_MULT_REG", st.ffY ? State::S1 : State::S0);
-	cell->setParam("\\BOT_8x8_MULT_REG", st.ffY ? State::S1 : State::S0);
-	cell->setParam("\\PIPELINE_16x16_MULT_REG1", st.ffY ? State::S1 : State::S0);
+	cell->setParam("\\TOP_8x8_MULT_REG", st.ffH ? State::S1 : State::S0);
+	cell->setParam("\\BOT_8x8_MULT_REG", st.ffH ? State::S1 : State::S0);
+	cell->setParam("\\PIPELINE_16x16_MULT_REG1", st.ffH ? State::S1 : State::S0);
 	cell->setParam("\\PIPELINE_16x16_MULT_REG2", State::S0);
 
-	cell->setParam("\\TOPOUTPUT_SELECT", Const(st.ffS ? 1 : 3, 2));
+	cell->setParam("\\TOPOUTPUT_SELECT", Const(st.ffO_hi ? 1 : (st.addAB ? 0 : 3), 2));
 	cell->setParam("\\TOPADDSUB_LOWERINPUT", Const(2, 2));
-	cell->setParam("\\TOPADDSUB_UPPERINPUT", State::S0);
+	cell->setParam("\\TOPADDSUB_UPPERINPUT", accum ? State::S0 : State::S1);
 	cell->setParam("\\TOPADDSUB_CARRYSELECT", Const(3, 2));
 
-	cell->setParam("\\BOTOUTPUT_SELECT", Const(st.ffS ? 1 : 3, 2));
+	cell->setParam("\\BOTOUTPUT_SELECT", Const(st.ffO_lo ? 1 : (st.addAB ? 0 : 3), 2));
 	cell->setParam("\\BOTADDSUB_LOWERINPUT", Const(2, 2));
-	cell->setParam("\\BOTADDSUB_UPPERINPUT", State::S0);
+	cell->setParam("\\BOTADDSUB_UPPERINPUT", accum ? State::S0 : State::S1);
 	cell->setParam("\\BOTADDSUB_CARRYSELECT", Const(0, 2));
 
 	cell->setParam("\\MODE_8x8", State::S0);
-	cell->setParam("\\A_SIGNED", mul_signed ? State::S1 : State::S0);
-	cell->setParam("\\B_SIGNED", mul_signed ? State::S1 : State::S0);
+	cell->setParam("\\A_SIGNED", a_signed);
+	cell->setParam("\\B_SIGNED", b_signed);
 
 	pm.autoremove(st.mul);
-	pm.autoremove(st.ffY);
-	pm.autoremove(st.ffS);
+	pm.autoremove(st.ffH);
+	pm.autoremove(st.addAB);
+	if (st.ffO_lo) {
+			SigSpec O = st.sigO.extract(0,16);
+			st.ffO_lo->connections_.at("\\Q").replace(O, pm.module->addWire(NEW_ID, GetSize(O)));
+	}
+	if (st.ffO_hi) {
+			SigSpec O = st.sigO.extract(16,16);
+			st.ffO_hi->connections_.at("\\Q").replace(O, pm.module->addWire(NEW_ID, GetSize(O)));
+	}
 }
 
 struct Ice40DspPass : public Pass {
diff --git a/passes/pmgen/ice40_dsp.pmg b/passes/pmgen/ice40_dsp.pmg
index 1f3590d4e..73439cfd9 100644
--- a/passes/pmgen/ice40_dsp.pmg
+++ b/passes/pmgen/ice40_dsp.pmg
@@ -1,87 +1,88 @@
 pattern ice40_dsp
 
 state <SigBit> clock
-state <bool> clock_pol clock_vld
-state <SigSpec> sigA sigB sigY sigS
+state <bool> clock_pol sigCD_signed
+state <SigSpec> sigA sigB sigCD sigH sigO
 state <Cell*> addAB muxAB
 
 match mul
-	select mul->type.in($mul)
+	select mul->type.in($mul, $__MUL16X16)
 	select GetSize(mul->getPort(\A)) + GetSize(mul->getPort(\B)) > 10
 	select GetSize(mul->getPort(\Y)) > 10
 endmatch
 
 match ffA
 	select ffA->type.in($dff)
-	// select nusers(port(ffA, \Q)) == 2
-	index <SigSpec> port(ffA, \Q) === port(mul, \A)
+	filter !port(mul, \A).remove_const().empty()
+	filter includes(port(ffA, \Q).to_sigbit_set(), port(mul, \A).remove_const().to_sigbit_set())
 	optional
 endmatch
 
-code sigA clock clock_pol clock_vld
+code sigA clock clock_pol
 	sigA = port(mul, \A);
 
 	if (ffA) {
-		sigA = port(ffA, \D);
-
 		clock = port(ffA, \CLK).as_bit();
 		clock_pol = param(ffA, \CLK_POLARITY).as_bool();
-		clock_vld = true;
+
+		sigA.replace(port(ffA, \Q), port(ffA, \D));
 	}
 endcode
 
 match ffB
 	select ffB->type.in($dff)
-	// select nusers(port(ffB, \Q)) == 2
-	index <SigSpec> port(ffB, \Q) === port(mul, \B)
+	filter !port(mul, \B).remove_const().empty()
+	filter includes(port(ffB, \Q).to_sigbit_set(), port(mul, \B).remove_const().to_sigbit_set())
 	optional
 endmatch
 
-code sigB clock clock_pol clock_vld
+code sigB clock clock_pol
 	sigB = port(mul, \B);
 
 	if (ffB) {
-		sigB = port(ffB, \D);
 		SigBit c = port(ffB, \CLK).as_bit();
 		bool cp = param(ffB, \CLK_POLARITY).as_bool();
 
-		if (clock_vld && (c != clock || cp != clock_pol))
+		if (clock != SigBit() && (c != clock || cp != clock_pol))
 			reject;
 
 		clock = c;
 		clock_pol = cp;
-		clock_vld = true;
+
+		sigB.replace(port(ffB, \Q), port(ffB, \D));
 	}
 endcode
 
-match ffY
-	select ffY->type.in($dff)
-	select nusers(port(ffY, \D)) == 2
-	index <SigSpec> port(ffY, \D) === port(mul, \Y)
+match ffH
+	select ffH->type.in($dff)
+	select nusers(port(ffH, \D)) == 2
+	index <SigSpec> port(ffH, \D) === port(mul, \Y)
 	optional
 endmatch
 
-code sigY clock clock_pol clock_vld
-	sigY = port(mul, \Y);
+code sigH sigO clock clock_pol
+	sigH = port(mul, \Y);
+	sigO = sigH;
 
-	if (ffY) {
-		sigY = port(ffY, \Q);
-		SigBit c = port(ffY, \CLK).as_bit();
-		bool cp = param(ffY, \CLK_POLARITY).as_bool();
+	if (ffH) {
+		sigH = port(ffH, \Q);
+		sigO = sigH;
 
-		if (clock_vld && (c != clock || cp != clock_pol))
+		SigBit c = port(ffH, \CLK).as_bit();
+		bool cp = param(ffH, \CLK_POLARITY).as_bool();
+
+		if (clock != SigBit() && (c != clock || cp != clock_pol))
 			reject;
 
 		clock = c;
 		clock_pol = cp;
-		clock_vld = true;
 	}
 endcode
 
 match addA
 	select addA->type.in($add)
 	select nusers(port(addA, \A)) == 2
-	index <SigSpec> port(addA, \A) === sigY
+	index <SigSpec> port(addA, \A) === sigH
 	optional
 endmatch
 
@@ -89,74 +90,112 @@ match addB
 	if !addA
 	select addB->type.in($add, $sub)
 	select nusers(port(addB, \B)) == 2
-	index <SigSpec> port(addB, \B) === sigY
+	index <SigSpec> port(addB, \B) === sigH
 	optional
 endmatch
 
-code addAB sigS
+code addAB sigCD sigCD_signed sigO
 	if (addA) {
 		addAB = addA;
-		sigS = port(addA, \B);
+		sigCD = port(addAB, \B);
+		sigCD_signed = param(addAB, \B_SIGNED).as_bool();
 	}
 	if (addB) {
 		addAB = addB;
-		sigS = port(addB, \A);
+		sigCD = port(addAB, \A);
+		sigCD_signed = param(addAB, \A_SIGNED).as_bool();
 	}
 	if (addAB) {
 		int natural_mul_width = GetSize(sigA) + GetSize(sigB);
-		int actual_mul_width = GetSize(sigY);
-		int actual_acc_width = GetSize(sigS);
+		int actual_mul_width = GetSize(sigH);
+		int actual_acc_width = GetSize(sigO);
 
 		if ((actual_acc_width > actual_mul_width) && (natural_mul_width > actual_mul_width))
 			reject;
-		if ((actual_acc_width != actual_mul_width) && (param(mul, \A_SIGNED).as_bool() != param(addAB, \A_SIGNED).as_bool()))
+		if ((actual_acc_width != actual_mul_width) && (param(mul, \A_SIGNED).as_bool() != param(addAB, \B_SIGNED).as_bool()))
 			reject;
+
+		sigO = port(addAB, \Y);
 	}
 endcode
 
 match muxA
-	if addAB
 	select muxA->type.in($mux)
 	select nusers(port(muxA, \A)) == 2
-	index <SigSpec> port(muxA, \A) === port(addAB, \Y)
+	index <SigSpec> port(muxA, \A) === sigO
 	optional
 endmatch
 
 match muxB
-	if addAB
 	if !muxA
 	select muxB->type.in($mux)
 	select nusers(port(muxB, \B)) == 2
-	index <SigSpec> port(muxB, \B) === port(addAB, \Y)
+	index <SigSpec> port(muxB, \B) === sigO
 	optional
 endmatch
 
 code muxAB
-	muxAB = addAB;
 	if (muxA)
 		muxAB = muxA;
-	if (muxB)
+	else if (muxB)
 		muxAB = muxB;
 endcode
 
-match ffS
-	if muxAB
-	select ffS->type.in($dff)
-	select nusers(port(ffS, \D)) == 2
-	index <SigSpec> port(ffS, \D) === port(muxAB, \Y)
-	index <SigSpec> port(ffS, \Q) === sigS
+match ffO_lo
+	select ffO_lo->type.in($dff)
+	filter nusers(sigO.extract(0,16)) == 2
+	filter includes(port(ffO_lo, \D).to_sigbit_set(), sigO.extract(0,16).to_sigbit_set())
+	optional
 endmatch
 
-code clock clock_pol clock_vld
-	if (ffS) {
-		SigBit c = port(ffS, \CLK).as_bit();
-		bool cp = param(ffS, \CLK_POLARITY).as_bool();
-
-		if (clock_vld && (c != clock || cp != clock_pol))
-			reject;
+match ffO_hi
+	select ffO_hi->type.in($dff)
+	filter nusers(sigO.extract(16,16)) == 2
+	filter includes(port(ffO_hi, \D).to_sigbit_set(), sigO.extract(16,16).to_sigbit_set())
+	optional
+endmatch
 
-		clock = c;
-		clock_pol = cp;
-		clock_vld = true;
+code clock clock_pol sigO sigCD sigCD_signed
+	if (ffO_lo || ffO_hi) {
+		if (ffO_lo) {
+			SigBit c = port(ffO_lo, \CLK).as_bit();
+			bool cp = param(ffO_lo, \CLK_POLARITY).as_bool();
+
+			if (clock != SigBit() && (c != clock || cp != clock_pol))
+				reject;
+
+			clock = c;
+			clock_pol = cp;
+
+			if (port(ffO_lo, \Q) != sigO.extract(0,16))
+				sigO.replace(port(ffO_lo, \D), port(ffO_lo, \Q));
+		}
+
+		if (ffO_hi) {
+			SigBit c = port(ffO_hi, \CLK).as_bit();
+			bool cp = param(ffO_hi, \CLK_POLARITY).as_bool();
+
+			if (clock != SigBit() && (c != clock || cp != clock_pol))
+				reject;
+
+			clock = c;
+			clock_pol = cp;
+
+			if (port(ffO_hi, \Q) != sigO.extract(16,16))
+				sigO.replace(port(ffO_hi, \D), port(ffO_hi, \Q));
+		}
+
+		// Loading value into output register is not
+		//   supported unless using accumulator
+		if (muxAB) {
+			if (sigCD != sigO)
+				reject;
+			if (muxA)
+				sigCD = port(muxAB, \B);
+			else if (muxB)
+				sigCD = port(muxAB, \A);
+			else log_abort();
+			sigCD_signed = addAB && param(addAB, \A_SIGNED).as_bool() && param(addAB, \B_SIGNED).as_bool();
+		}
 	}
 endcode
diff --git a/passes/pmgen/xilinx_dsp.cc b/passes/pmgen/xilinx_dsp.cc
new file mode 100644
index 000000000..d87d63670
--- /dev/null
+++ b/passes/pmgen/xilinx_dsp.cc
@@ -0,0 +1,147 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Clifford Wolf <clifford@clifford.at>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "kernel/yosys.h"
+#include "kernel/sigtools.h"
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+template<class T> bool includes(const T &lhs, const T &rhs) { // is sorted range 'rhs' a subset of sorted range 'lhs'?
+	return std::includes(std::begin(lhs), std::end(lhs), std::begin(rhs), std::end(rhs));
+}
+#include "passes/pmgen/xilinx_dsp_pm.h"
+
+void pack_xilinx_dsp(xilinx_dsp_pm &pm) // per-match callback: rewires the matched DSP cell so the matched A/B/P flip-flops become its internal registers
+{
+	auto &st = pm.st_xilinx_dsp;
+
+#if 0 // verbose dump of the match state (and the whole module); enable only when debugging
+	log("\n");
+	log("ffA:   %s\n", log_id(st.ffA, "--"));
+	log("ffB:   %s\n", log_id(st.ffB, "--"));
+	log("dsp:   %s\n", log_id(st.dsp, "--"));
+	log("ffP:   %s\n", log_id(st.ffP, "--"));
+	log("muxP:  %s\n", log_id(st.muxP, "--"));
+	log("sigPused: %s\n", log_signal(st.sigPused));
+	log_module(pm.module);
+#endif
+
+	log("Analysing %s.%s for Xilinx DSP register packing.\n", log_id(pm.module), log_id(st.dsp));
+
+	Cell *cell = st.dsp;
+	log_assert(cell);
+
+	if (st.clock != SigBit()) // a default-constructed clock means no flip-flop was matched, so there is nothing to pack
+	{
+		cell->setPort("\\CLK", st.clock);
+
+		if (st.ffA) { // input register on A: feed the DSP from the flop's D instead of its Q
+			SigSpec A = cell->getPort("\\A");
+			SigSpec D = st.ffA->getPort("\\D");
+			SigSpec Q = st.ffA->getPort("\\Q");
+			A.replace(Q, D);
+			cell->setPort("\\A", A);
+			cell->setParam("\\AREG", State::S1);
+			if (st.ffA->type == "$dff")
+				cell->setPort("\\CEA2", State::S1); // no enable on a plain $dff: always enabled
+			else if (st.ffA->type == "$dffe")
+				cell->setPort("\\CEA2", st.ffA->getPort("\\EN"));
+			else log_abort();
+		}
+		if (st.ffB) { // input register on B: same treatment as A
+			SigSpec B = cell->getPort("\\B");
+			SigSpec D = st.ffB->getPort("\\D");
+			SigSpec Q = st.ffB->getPort("\\Q");
+			B.replace(Q, D);
+			cell->setPort("\\B", B);
+			cell->setParam("\\BREG", State::S1);
+			if (st.ffB->type == "$dff")
+				cell->setPort("\\CEB2", State::S1);
+			else if (st.ffB->type == "$dffe")
+				cell->setPort("\\CEB2", st.ffB->getPort("\\EN"));
+			else log_abort();
+		}
+		if (st.ffP) { // output register on P: make the DSP drive the flop's Q bits directly
+			SigSpec P = cell->getPort("\\P");
+			SigSpec D;
+			if (st.muxP)
+				D = st.muxP->getPort("\\B"); // the flop is fed through muxP, so the P bits are found on the mux's B input
+			else
+				D = st.ffP->getPort("\\D");
+			SigSpec Q = st.ffP->getPort("\\Q");
+			P.replace(D, Q); // only the P bits that reach the flop are swapped for the matching Q bits
+			cell->setPort("\\P", P); // use the rewritten P (not Q) so bit positions and the remaining P bits are preserved
+			cell->setParam("\\PREG", State::S1);
+			if (st.ffP->type == "$dff")
+				cell->setPort("\\CEP", State::S1);
+			else if (st.ffP->type == "$dffe")
+				cell->setPort("\\CEP", st.ffP->getPort("\\EN"));
+			else log_abort();
+		}
+
+		log("  clock: %s (%s)", log_signal(st.clock), "posedge");
+
+		if (st.ffA)
+			log(" ffA:%s", log_id(st.ffA));
+
+		if (st.ffB)
+			log(" ffB:%s", log_id(st.ffB));
+
+		if (st.ffP)
+			log(" ffP:%s", log_id(st.ffP));
+
+		log("\n");
+	}
+
+	pm.blacklist(cell); // NOTE(review): presumably excludes this DSP from further matches; the packed flops are not removed here - confirm they are cleaned up elsewhere
+}
+
+struct XilinxDspPass : public Pass { // registers the "xilinx_dsp" command (struct was misnamed Ice40DspPass by copy-paste)
+	XilinxDspPass() : Pass("xilinx_dsp", "Xilinx: pack DSP registers") { }
+	void help() YS_OVERRIDE
+	{
+		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
+		log("\n");
+		log("    xilinx_dsp [options] [selection]\n");
+		log("\n");
+		log("Pack registers into Xilinx DSPs\n");
+		log("\n");
+	}
+	void execute(std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
+	{
+		log_header(design, "Executing XILINX_DSP pass (pack DSPs).\n");
+
+		size_t argidx;
+		for (argidx = 1; argidx < args.size(); argidx++) // no options are recognised yet; the loop is a placeholder for future flags
+		{
+			// if (args[argidx] == "-singleton") {
+			// 	singleton_mode = true;
+			// 	continue;
+			// }
+			break;
+		}
+		extra_args(args, argidx, design);
+
+		for (auto module : design->selected_modules()) // run the generated pattern matcher over each selected module's selected cells
+			xilinx_dsp_pm(module, module->selected_cells()).run_xilinx_dsp(pack_xilinx_dsp);
+	}
+} XilinxDspPass;
+
+PRIVATE_NAMESPACE_END
diff --git a/passes/pmgen/xilinx_dsp.pmg b/passes/pmgen/xilinx_dsp.pmg
new file mode 100644
index 000000000..a97ab4dd5
--- /dev/null
+++ b/passes/pmgen/xilinx_dsp.pmg
@@ -0,0 +1,102 @@
+pattern xilinx_dsp
+
+state <SigBit> clock
+state <SigSpec> sigPused
+
+match dsp
+	select dsp->type.in(\DSP48E1)
+endmatch
+
+match ffA
+	select ffA->type.in($dff, $dffe)
+	// DSP48E1 does not support clock inversion
+	select param(ffA, \CLK_POLARITY).as_bool()
+	filter !port(dsp, \A).remove_const().empty()
+	filter includes(port(ffA, \Q).to_sigbit_set(), port(dsp, \A).remove_const().to_sigbit_set())
+	optional
+endmatch
+
+code clock
+	if (ffA)
+		clock = port(ffA, \CLK).as_bit();
+endcode
+
+match ffB
+	select ffB->type.in($dff, $dffe)
+	// DSP48E1 does not support clock inversion
+	select param(ffB, \CLK_POLARITY).as_bool()
+	filter !port(dsp, \B).remove_const().empty()
+	filter includes(port(ffB, \Q).to_sigbit_set(), port(dsp, \B).remove_const().to_sigbit_set())
+	optional
+endmatch
+
+code clock
+	if (ffB) {
+		SigBit c = port(ffB, \CLK).as_bit();
+
+		if (clock != SigBit() && c != clock)
+			reject;
+
+		clock = c;
+	}
+endcode
+
+// Extract the bits of P that actually have a consumer
+// (as opposed to being a sign extension)
+code sigPused
+	SigSpec P = port(dsp, \P);
+	int i;
+	for (i = GetSize(P); i > 0; i--)
+		if (nusers(P[i-1]) > 1)
+			break;
+	sigPused = P.extract(0, i).remove_const();
+endcode
+
+match ffP
+	if !sigPused.empty()
+	select ffP->type.in($dff, $dffe)
+	select nusers(port(ffP, \D)) == 2
+	// DSP48E1 does not support clock inversion
+	select param(ffP, \CLK_POLARITY).as_bool()
+	filter param(ffP, \WIDTH).as_int() >= GetSize(sigPused)
+	filter includes(port(ffP, \D).to_sigbit_set(), sigPused.to_sigbit_set())
+	optional
+endmatch
+
+// $mux cell left behind by dff2dffe
+//   would prefer not to run 'opt_expr -mux_undef'
+//   since that would lose information helpful for
+//   efficient wide-mux inference
+match muxP
+	if !sigPused.empty() && !ffP
+	select muxP->type.in($mux)
+	select nusers(port(muxP, \B)) == 2
+	select port(muxP, \A).is_fully_undef()
+	filter param(muxP, \WIDTH).as_int() >= GetSize(sigPused)
+	filter includes(port(muxP, \B).to_sigbit_set(), sigPused.to_sigbit_set())
+	optional
+endmatch
+
+match ffY
+	if muxP
+	select ffY->type.in($dff, $dffe)
+	select nusers(port(ffY, \D)) == 2
+	// DSP48E1 does not support clock inversion
+	select param(ffY, \CLK_POLARITY).as_bool()
+	filter param(ffY, \WIDTH).as_int() >= GetSize(sigPused)
+	filter includes(port(ffY, \D).to_sigbit_set(), port(muxP, \Y).to_sigbit_set())
+endmatch
+
+code ffP clock
+	if (ffY)
+		ffP = ffY;
+
+	if (ffP) {
+		SigBit c = port(ffP, \CLK).as_bit();
+
+		if (clock != SigBit() && c != clock)
+			reject;
+
+		clock = c;
+	}
+endcode
diff --git a/passes/tests/test_autotb.cc b/passes/tests/test_autotb.cc
index bfb1d6642..7f11e54f3 100644
--- a/passes/tests/test_autotb.cc
+++ b/passes/tests/test_autotb.cc
@@ -348,6 +348,11 @@ struct TestAutotbBackend : public Backend {
 		log("    -n <int>\n");
 		log("        number of iterations the test bench should run (default = 1000)\n");
 		log("\n");
+		log("    -seed <int>\n");
+		log("        seed used for pseudo-random number generation (default = 0).\n");
+		log("        a value of 0 will cause an arbitrary seed to be chosen, based on\n");
+		log("        the current system time.\n");
+		log("\n");
 	}
 	void execute(std::ostream *&f, std::string filename, std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
 	{
diff --git a/techlibs/common/Makefile.inc b/techlibs/common/Makefile.inc
index 0e05620bc..e6d1c2f29 100644
--- a/techlibs/common/Makefile.inc
+++ b/techlibs/common/Makefile.inc
@@ -28,3 +28,4 @@ $(eval $(call add_share_file,share,techlibs/common/dff2ff.v))
 $(eval $(call add_share_file,share,techlibs/common/gate2lut.v))
 $(eval $(call add_share_file,share,techlibs/common/cmp2lut.v))
 $(eval $(call add_share_file,share,techlibs/common/cells.lib))
+$(eval $(call add_share_file,share,techlibs/common/mul2dsp.v))
diff --git a/techlibs/common/mul2dsp.v b/techlibs/common/mul2dsp.v
new file mode 100644
index 000000000..b28a4247e
--- /dev/null
+++ b/techlibs/common/mul2dsp.v
@@ -0,0 +1,302 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2012  Clifford Wolf <clifford@clifford.at>
+ *                2019  Eddie Hung    <eddie@fpgeh.com>
+ *                2019  David Shah    <dave@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ *  ---
+ *
+ *  Tech-mapping rules for decomposing arbitrarily-sized $mul cells
+ *  into an equivalent collection of smaller `DSP_NAME cells (with the 
+ *  same interface as $mul) no larger than `DSP_[AB]_MAXWIDTH, attached 
+ *  to $shl and $add cells.
+ *
+ */
+
+`ifndef DSP_A_MAXWIDTH
+$error("Macro DSP_A_MAXWIDTH must be defined");
+`endif
+`ifndef DSP_B_MAXWIDTH
+$error("Macro DSP_B_MAXWIDTH must be defined");
+`endif
+// DSP_NAME is the cell type instantiated for every multiply that fits the limits above.
+`ifndef DSP_NAME
+$error("Macro DSP_NAME must be defined");
+`endif
+// Arguments are parenthesized so that expression operands keep their own grouping.
+`define MAX(a,b) (((a) > (b)) ? (a) : (b))
+`define MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+module \$mul (A, B, Y); // techmap entry point: normalizes a $mul cell before handing it to $__mul
+	parameter A_SIGNED = 0;
+	parameter B_SIGNED = 0;
+	parameter A_WIDTH = 1;
+	parameter B_WIDTH = 1;
+	parameter Y_WIDTH = 1;
+
+	input [A_WIDTH-1:0] A;
+	input [B_WIDTH-1:0] B;
+	output [Y_WIDTH-1:0] Y;
+
+	generate
+	if (A_SIGNED != B_SIGNED || A_WIDTH <= 1 || B_WIDTH <= 1)
+		wire _TECHMAP_FAIL_ = 1; // mixed signedness or a 1-bit operand: this template declines the cell
+	// NB: A_SIGNED == B_SIGNED from here
+	else if (A_WIDTH < B_WIDTH)
+		\$mul #( // re-emit with the operands swapped so that A is the wider one
+			.A_SIGNED(B_SIGNED),
+			.B_SIGNED(A_SIGNED),
+			.A_WIDTH(B_WIDTH),
+			.B_WIDTH(A_WIDTH),
+			.Y_WIDTH(Y_WIDTH)
+		) _TECHMAP_REPLACE_ (
+			.A(B),
+			.B(A),
+			.Y(Y)
+		);
+	else
+		\$__mul #( // A is at least as wide as B: pass the cell on unchanged as a $__mul
+			.A_SIGNED(A_SIGNED),
+			.B_SIGNED(B_SIGNED),
+			.A_WIDTH(A_WIDTH),
+			.B_WIDTH(B_WIDTH),
+			.Y_WIDTH(Y_WIDTH)
+		) _TECHMAP_REPLACE_ (
+			.A(A),
+			.B(B),
+			.Y(Y)
+		);
+	endgenerate
+endmodule
+
+module \$__mul (A, B, Y); // recursive worker: splits an oversized multiply into DSP-sized slices
+	parameter A_SIGNED = 0;
+	parameter B_SIGNED = 0;
+	parameter A_WIDTH = 1;
+	parameter B_WIDTH = 1;
+	parameter Y_WIDTH = 1;
+
+	input [A_WIDTH-1:0] A;
+	input [B_WIDTH-1:0] B;
+	output [Y_WIDTH-1:0] Y;
+
+	wire [1023:0] _TECHMAP_DO_ = "proc; clean";
+
+`ifdef DSP_SIGNEDONLY
+	localparam sign_headroom = 1; // every non-final slice gives up one bit for a zero sign bit
+`else
+	localparam sign_headroom = 0;
+`endif
+
+	genvar i;
+	generate
+		if (A_WIDTH <= 1 || B_WIDTH <= 1)
+			wire _TECHMAP_FAIL_ = 1;
+`ifdef DSP_MINWIDTH
+		else if (A_WIDTH+B_WIDTH < `DSP_MINWIDTH || Y_WIDTH < `DSP_MINWIDTH)
+			wire _TECHMAP_FAIL_ = 1;
+`endif
+		else if (A_WIDTH > `DSP_A_MAXWIDTH) begin
+			localparam n = (A_WIDTH+`DSP_A_MAXWIDTH-sign_headroom-1) / (`DSP_A_MAXWIDTH-sign_headroom);
+			localparam partial_Y_WIDTH = `MIN(Y_WIDTH, B_WIDTH+`DSP_A_MAXWIDTH);
+			localparam last_Y_WIDTH = `MIN(partial_Y_WIDTH, B_WIDTH+A_WIDTH-(n-1)*(`DSP_A_MAXWIDTH-sign_headroom));
+			if (A_SIGNED && B_SIGNED) begin // signed wires so that the shifted partial sums sign-extend
+				wire signed [partial_Y_WIDTH-1:0] partial [n-2:0];
+				wire signed [last_Y_WIDTH-1:0] last_partial;
+				wire signed [Y_WIDTH-1:0] partial_sum [n-1:0];
+			end
+			else begin
+				wire [partial_Y_WIDTH-1:0] partial [n-2:0]; // slices 0..n-2 only; the top slice uses last_partial
+				wire [last_Y_WIDTH-1:0] last_partial;
+				wire [Y_WIDTH-1:0] partial_sum [n-1:0];
+			end
+			// Lowest A slice: needs no shift, so it seeds the running sum.
+			\$__mul #(
+				.A_SIGNED(sign_headroom),
+				.B_SIGNED(B_SIGNED),
+				.A_WIDTH(`DSP_A_MAXWIDTH),
+				.B_WIDTH(B_WIDTH),
+				.Y_WIDTH(partial_Y_WIDTH)
+			) mul_slice_first (
+				.A({{sign_headroom{1'b0}}, A[`DSP_A_MAXWIDTH-sign_headroom-1 : 0]}),
+				.B(B),
+				.Y(partial[0])
+			);
+			assign partial_sum[0] = partial[0];
+
+			for (i = 1; i < n-1; i=i+1) begin:slice
+				\$__mul #(
+					.A_SIGNED(sign_headroom),
+					.B_SIGNED(B_SIGNED),
+					.A_WIDTH(`DSP_A_MAXWIDTH),
+					.B_WIDTH(B_WIDTH),
+					.Y_WIDTH(partial_Y_WIDTH)
+				) mul_slice (
+					.A({{sign_headroom{1'b0}}, A[i*(`DSP_A_MAXWIDTH-sign_headroom) +: `DSP_A_MAXWIDTH-sign_headroom]}),
+					.B(B),
+					.Y(partial[i])
+				);
+				// TODO: Currently a 'cascade' approach to summing the partial
+				//       products is taken here, but a more efficient 'binary
+				//       reduction' approach also exists...
+				assign partial_sum[i] = (partial[i] << i*(`DSP_A_MAXWIDTH-sign_headroom)) + partial_sum[i-1];
+			end
+			// Top A slice: keeps the operand's own signedness and takes whatever bits are left over.
+			localparam last_A_WIDTH = A_WIDTH-(n-1)*(`DSP_A_MAXWIDTH-sign_headroom);
+			\$__mul #(
+				.A_SIGNED(A_SIGNED),
+				.B_SIGNED(B_SIGNED),
+				.A_WIDTH(last_A_WIDTH),
+				.B_WIDTH(B_WIDTH),
+				.Y_WIDTH(last_Y_WIDTH)
+			) mul_slice_last (
+				.A(A[A_WIDTH-1 -: last_A_WIDTH]),
+				.B(B),
+				.Y(last_partial)
+			);
+			assign partial_sum[n-1] = (last_partial << (n-1)*(`DSP_A_MAXWIDTH-sign_headroom)) + partial_sum[n-2];
+			assign Y = partial_sum[n-1];
+		end
+		else if (B_WIDTH > `DSP_B_MAXWIDTH) begin
+			localparam n = (B_WIDTH+`DSP_B_MAXWIDTH-sign_headroom-1) / (`DSP_B_MAXWIDTH-sign_headroom);
+			localparam partial_Y_WIDTH = `MIN(Y_WIDTH, A_WIDTH+`DSP_B_MAXWIDTH);
+			localparam last_Y_WIDTH = `MIN(partial_Y_WIDTH, A_WIDTH+B_WIDTH-(n-1)*(`DSP_B_MAXWIDTH-sign_headroom));
+			if (A_SIGNED && B_SIGNED) begin // signed wires so that the shifted partial sums sign-extend
+				wire signed [partial_Y_WIDTH-1:0] partial [n-2:0];
+				wire signed [last_Y_WIDTH-1:0] last_partial;
+				wire signed [Y_WIDTH-1:0] partial_sum [n-1:0];
+			end
+			else begin
+				wire [partial_Y_WIDTH-1:0] partial [n-2:0]; // slices 0..n-2 only; the top slice uses last_partial
+				wire [last_Y_WIDTH-1:0] last_partial;
+				wire [Y_WIDTH-1:0] partial_sum [n-1:0];
+			end
+			// Lowest B slice: needs no shift, so it seeds the running sum.
+			\$__mul #(
+				.A_SIGNED(A_SIGNED),
+				.B_SIGNED(sign_headroom),
+				.A_WIDTH(A_WIDTH),
+				.B_WIDTH(`DSP_B_MAXWIDTH),
+				.Y_WIDTH(partial_Y_WIDTH)
+			) mul_first (
+				.A(A),
+				.B({{sign_headroom{1'b0}}, B[`DSP_B_MAXWIDTH-sign_headroom-1 : 0]}),
+				.Y(partial[0])
+			);
+			assign partial_sum[0] = partial[0];
+
+			for (i = 1; i < n-1; i=i+1) begin:slice
+				\$__mul #(
+					.A_SIGNED(A_SIGNED),
+					.B_SIGNED(sign_headroom),
+					.A_WIDTH(A_WIDTH),
+					.B_WIDTH(`DSP_B_MAXWIDTH),
+					.Y_WIDTH(partial_Y_WIDTH)
+				) mul (
+					.A(A),
+					.B({{sign_headroom{1'b0}}, B[i*(`DSP_B_MAXWIDTH-sign_headroom) +: `DSP_B_MAXWIDTH-sign_headroom]}),
+					.Y(partial[i])
+				);
+				// TODO: Currently a 'cascade' approach to summing the partial
+				//       products is taken here, but a more efficient 'binary
+				//       reduction' approach also exists...
+				assign partial_sum[i] = (partial[i] << i*(`DSP_B_MAXWIDTH-sign_headroom)) + partial_sum[i-1];
+			end
+			// Top B slice: keeps the operand's own signedness and takes whatever bits are left over.
+			localparam last_B_WIDTH = B_WIDTH-(n-1)*(`DSP_B_MAXWIDTH-sign_headroom);
+			\$__mul #(
+				.A_SIGNED(A_SIGNED),
+				.B_SIGNED(B_SIGNED),
+				.A_WIDTH(A_WIDTH),
+				.B_WIDTH(last_B_WIDTH),
+				.Y_WIDTH(last_Y_WIDTH)
+			) mul_last (
+				.A(A),
+				.B(B[B_WIDTH-1 -: last_B_WIDTH]),
+				.Y(last_partial)
+			);
+			assign partial_sum[n-1] = (last_partial << (n-1)*(`DSP_B_MAXWIDTH-sign_headroom)) + partial_sum[n-2];
+			assign Y = partial_sum[n-1];
+		end
+		else begin // both operands fit: emit a single DSP cell
+			`DSP_NAME #(
+				.A_SIGNED(A_SIGNED),
+				.B_SIGNED(B_SIGNED),
+				.A_WIDTH(A_WIDTH),
+				.B_WIDTH(B_WIDTH),
+				.Y_WIDTH(`MIN(Y_WIDTH,A_WIDTH+B_WIDTH))
+			) _TECHMAP_REPLACE_ (
+				.A(A),
+				.B(B),
+				.Y(Y)
+			);
+		end
+	endgenerate
+endmodule
+
+(* techmap_celltype = "$__mul" *)
+module \$__soft_mul (A, B, Y); // fallback for $__mul cells: re-emits them as $__soft_mul with equal operand signedness
+	parameter A_SIGNED = 0;
+	parameter B_SIGNED = 0;
+	parameter A_WIDTH = 1;
+	parameter B_WIDTH = 1;
+	parameter Y_WIDTH = 1;
+
+	input [A_WIDTH-1:0] A;
+	input [B_WIDTH-1:0] B;
+	output [Y_WIDTH-1:0] Y;
+
+	// Indirection necessary since mapping
+	//   back to $mul will cause recursion
+	generate
+		if (A_SIGNED && !B_SIGNED) // widen unsigned B by a zero bit so it can be treated as signed
+			\$__soft_mul #(
+				.A_SIGNED(A_SIGNED),
+				.B_SIGNED(1),
+				.A_WIDTH(A_WIDTH),
+				.B_WIDTH(B_WIDTH+1),
+				.Y_WIDTH(Y_WIDTH)
+			) _TECHMAP_REPLACE_ (
+				.A(A),
+				.B({1'b0,B}),
+				.Y(Y)
+			);
+		else if (!A_SIGNED && B_SIGNED) // widen unsigned A by a zero bit so it can be treated as signed
+			\$__soft_mul #(
+				.A_SIGNED(1),
+				.B_SIGNED(B_SIGNED),
+				.A_WIDTH(A_WIDTH+1),
+				.B_WIDTH(B_WIDTH),
+				.Y_WIDTH(Y_WIDTH)
+			) _TECHMAP_REPLACE_ (
+				.A({1'b0,A}),
+				.B(B),
+				.Y(Y)
+			);
+		else // signedness already agrees: pass through unchanged
+			\$__soft_mul #(
+				.A_SIGNED(A_SIGNED),
+				.B_SIGNED(B_SIGNED),
+				.A_WIDTH(A_WIDTH),
+				.B_WIDTH(B_WIDTH),
+				.Y_WIDTH(Y_WIDTH)
+			) _TECHMAP_REPLACE_ (
+				.A(A),
+				.B(B),
+				.Y(Y)
+			);
+	endgenerate
+endmodule
diff --git a/techlibs/ecp5/Makefile.inc b/techlibs/ecp5/Makefile.inc
index 73e18112f..c7d6eee02 100644
--- a/techlibs/ecp5/Makefile.inc
+++ b/techlibs/ecp5/Makefile.inc
@@ -10,6 +10,7 @@ $(eval $(call add_share_file,share/ecp5,techlibs/ecp5/brams_map.v))
 $(eval $(call add_share_file,share/ecp5,techlibs/ecp5/bram.txt))
 $(eval $(call add_share_file,share/ecp5,techlibs/ecp5/arith_map.v))
 $(eval $(call add_share_file,share/ecp5,techlibs/ecp5/latches_map.v))
+$(eval $(call add_share_file,share/ecp5,techlibs/ecp5/dsp_map.v))
 
 $(eval $(call add_share_file,share/ecp5,techlibs/ecp5/abc_5g.box))
 $(eval $(call add_share_file,share/ecp5,techlibs/ecp5/abc_5g.lut))
diff --git a/techlibs/ecp5/dsp_map.v b/techlibs/ecp5/dsp_map.v
new file mode 100644
index 000000000..24e28869e
--- /dev/null
+++ b/techlibs/ecp5/dsp_map.v
@@ -0,0 +1,10 @@
+module \$__MUL18X18 (input [17:0] A, input [17:0] B, output [35:0] Y); // techmap template: one 18x18 multiply -> MULT18X18D
+	parameter A_SIGNED = 0, B_SIGNED = 0, A_WIDTH = 18, B_WIDTH = 18, Y_WIDTH = 36; // passed by the DSP_NAME instance in mul2dsp.v
+	MULT18X18D _TECHMAP_REPLACE_ (
+		.A0(A[0]), .A1(A[1]), .A2(A[2]), .A3(A[3]), .A4(A[4]), .A5(A[5]), .A6(A[6]), .A7(A[7]), .A8(A[8]), .A9(A[9]), .A10(A[10]), .A11(A[11]), .A12(A[12]), .A13(A[13]), .A14(A[14]), .A15(A[15]), .A16(A[16]), .A17(A[17]),
+		.B0(B[0]), .B1(B[1]), .B2(B[2]), .B3(B[3]), .B4(B[4]), .B5(B[5]), .B6(B[6]), .B7(B[7]), .B8(B[8]), .B9(B[9]), .B10(B[10]), .B11(B[11]), .B12(B[12]), .B13(B[13]), .B14(B[14]), .B15(B[15]), .B16(B[16]), .B17(B[17]),
+		.C17(1'b0), .C16(1'b0), .C15(1'b0), .C14(1'b0), .C13(1'b0), .C12(1'b0), .C11(1'b0), .C10(1'b0), .C9(1'b0), .C8(1'b0), .C7(1'b0), .C6(1'b0), .C5(1'b0), .C4(1'b0), .C3(1'b0), .C2(1'b0), .C1(1'b0), .C0(1'b0),
+		.SIGNEDA(A_SIGNED ? 1'b1 : 1'b0), .SIGNEDB(B_SIGNED ? 1'b1 : 1'b0), .SOURCEA(1'b0), .SOURCEB(1'b0), // follow the operand signedness of the mapped cell instead of forcing unsigned
+		.P0(Y[0]), .P1(Y[1]), .P2(Y[2]), .P3(Y[3]), .P4(Y[4]), .P5(Y[5]), .P6(Y[6]), .P7(Y[7]), .P8(Y[8]), .P9(Y[9]), .P10(Y[10]), .P11(Y[11]), .P12(Y[12]), .P13(Y[13]), .P14(Y[14]), .P15(Y[15]), .P16(Y[16]), .P17(Y[17]), .P18(Y[18]), .P19(Y[19]), .P20(Y[20]), .P21(Y[21]), .P22(Y[22]), .P23(Y[23]), .P24(Y[24]), .P25(Y[25]), .P26(Y[26]), .P27(Y[27]), .P28(Y[28]), .P29(Y[29]), .P30(Y[30]), .P31(Y[31]), .P32(Y[32]), .P33(Y[33]), .P34(Y[34]), .P35(Y[35])
+	);
+endmodule
diff --git a/techlibs/ecp5/synth_ecp5.cc b/techlibs/ecp5/synth_ecp5.cc
index 143d1f95c..3129ba929 100644
--- a/techlibs/ecp5/synth_ecp5.cc
+++ b/techlibs/ecp5/synth_ecp5.cc
@@ -89,6 +89,9 @@ struct SynthEcp5Pass : public ScriptPass
 		log("        generate an output netlist (and BLIF file) suitable for VPR\n");
 		log("        (this feature is experimental and incomplete)\n");
 		log("\n");
+		log("    -dsp\n");
+		log("        map multipliers to MULT18X18D (EXPERIMENTAL)\n");
+		log("\n");
 		log("\n");
 		log("The following commands are executed by this synthesis command:\n");
 		help_script();
@@ -96,7 +99,7 @@ struct SynthEcp5Pass : public ScriptPass
 	}
 
 	string top_opt, blif_file, edif_file, json_file;
-	bool noccu2, nodffe, nobram, nolutram, nowidelut, flatten, retime, abc2, abc9, vpr;
+	bool noccu2, nodffe, nobram, nolutram, nowidelut, flatten, retime, abc2, abc9, dsp, vpr;
 
 	void clear_flags() YS_OVERRIDE
 	{
@@ -114,6 +117,7 @@ struct SynthEcp5Pass : public ScriptPass
 		abc2 = false;
 		vpr = false;
 		abc9 = false;
+		dsp = false;
 	}
 
 	void execute(std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
@@ -192,6 +196,10 @@ struct SynthEcp5Pass : public ScriptPass
 				abc9 = true;
 				continue;
 			}
+			if (args[argidx] == "-dsp") {
+				dsp = true;
+				continue;
+			}
 			break;
 		}
 		extra_args(args, argidx, design);
@@ -228,7 +236,28 @@ struct SynthEcp5Pass : public ScriptPass
 
 		if (check_label("coarse"))
 		{
-			run("synth -run coarse");
+			run("opt_expr");
+			run("opt_clean");
+			run("check");
+			run("opt");
+			run("wreduce");
+			run("peepopt");
+			run("opt_clean");
+			run("share");
+			run("techmap -map +/cmp2lut.v -D LUT_WIDTH=4");
+			run("opt_expr");
+			run("opt_clean");
+			if (dsp) {
+				run("techmap -map +/mul2dsp.v -D DSP_A_MAXWIDTH=18 -D DSP_B_MAXWIDTH=18 -D DSP_NAME=$__MUL18X18");
+				run("clean");
+				run("techmap -map +/ecp5/dsp_map.v");
+			}
+			run("alumacc");
+			run("opt");
+			run("fsm");
+			run("opt -fast");
+			run("memory -nomap");
+			run("opt_clean");
 		}
 
 		if (!nobram && check_label("map_bram", "(skip if -nobram)"))
diff --git a/techlibs/ice40/synth_ice40.cc b/techlibs/ice40/synth_ice40.cc
index be60a0071..ce88a0542 100644
--- a/techlibs/ice40/synth_ice40.cc
+++ b/techlibs/ice40/synth_ice40.cc
@@ -265,8 +265,13 @@ struct SynthIce40Pass : public ScriptPass
 			run("techmap -map +/cmp2lut.v -D LUT_WIDTH=4");
 			run("opt_expr");
 			run("opt_clean");
-			if (help_mode || dsp)
-				run("ice40_dsp", "(if -dsp)");
+			if (help_mode || dsp) {
+				run("techmap -map +/mul2dsp.v -D DSP_A_MAXWIDTH=16 -D DSP_B_MAXWIDTH=16 -D DSP_MINWIDTH=11 -D DSP_NAME=$__MUL16X16", "(if -dsp)");
+				run("opt_expr", "                     (if -dsp)");
+				run("wreduce", "                      (if -dsp)");
+				run("ice40_dsp", "                    (if -dsp)");
+				run("chtype -set $mul t:$__soft_mul","(if -dsp)");
+			}
 			run("alumacc");
 			run("opt");
 			run("fsm");
diff --git a/techlibs/xilinx/Makefile.inc b/techlibs/xilinx/Makefile.inc
index 2c6e7432e..b0251d621 100644
--- a/techlibs/xilinx/Makefile.inc
+++ b/techlibs/xilinx/Makefile.inc
@@ -38,6 +38,7 @@ $(eval $(call add_share_file,share/xilinx,techlibs/xilinx/arith_map.v))
 $(eval $(call add_share_file,share/xilinx,techlibs/xilinx/ff_map.v))
 $(eval $(call add_share_file,share/xilinx,techlibs/xilinx/lut_map.v))
 $(eval $(call add_share_file,share/xilinx,techlibs/xilinx/mux_map.v))
+$(eval $(call add_share_file,share/xilinx,techlibs/xilinx/dsp_map.v))
 
 $(eval $(call add_share_file,share/xilinx,techlibs/xilinx/abc_xc7.box))
 $(eval $(call add_share_file,share/xilinx,techlibs/xilinx/abc_xc7.lut))
diff --git a/techlibs/xilinx/cells_sim.v b/techlibs/xilinx/cells_sim.v
index 05e46b4e7..33b2a8f62 100644
--- a/techlibs/xilinx/cells_sim.v
+++ b/techlibs/xilinx/cells_sim.v
@@ -378,3 +378,150 @@ module SRLC32E (
       always @(posedge CLK) if (CE) r <= { r[30:0], D };
   endgenerate
 endmodule
+
+
+module DSP48E1 ( // simulation model: only the plain multiply configuration checked below is implemented
+    output [29:0] ACOUT,
+    output [17:0] BCOUT,
+    output CARRYCASCOUT,
+    output [3:0] CARRYOUT,
+    output MULTSIGNOUT,
+    output OVERFLOW,
+    output reg signed [47:0] P,
+    output PATTERNBDETECT,
+    output PATTERNDETECT,
+    output [47:0] PCOUT,
+    output UNDERFLOW,
+    input signed [29:0] A,
+    input [29:0] ACIN,
+    input [3:0] ALUMODE,
+    input signed [17:0] B,
+    input [17:0] BCIN,
+    input [47:0] C,
+    input CARRYCASCIN,
+    input CARRYIN,
+    input [2:0] CARRYINSEL,
+    input CEA1,
+    input CEA2,
+    input CEAD,
+    input CEALUMODE,
+    input CEB1,
+    input CEB2,
+    input CEC,
+    input CECARRYIN,
+    input CECTRL,
+    input CED,
+    input CEINMODE,
+    input CEM,
+    input CEP,
+    input CLK,
+    input [24:0] D,
+    input [4:0] INMODE,
+    input MULTSIGNIN,
+    input [6:0] OPMODE,
+    input [47:0] PCIN,
+    input RSTA,
+    input RSTALLCARRYIN,
+    input RSTALUMODE,
+    input RSTB,
+    input RSTC,
+    input RSTCTRL,
+    input RSTD,
+    input RSTINMODE,
+    input RSTM,
+    input RSTP
+);
+    parameter integer ACASCREG = 1;
+    parameter integer ADREG = 1;
+    parameter integer ALUMODEREG = 1;
+    parameter integer AREG = 1;
+    parameter AUTORESET_PATDET = "NO_RESET";
+    parameter A_INPUT = "DIRECT";
+    parameter integer BCASCREG = 1;
+    parameter integer BREG = 1;
+    parameter B_INPUT = "DIRECT";
+    parameter integer CARRYINREG = 1;
+    parameter integer CARRYINSELREG = 1;
+    parameter integer CREG = 1;
+    parameter integer DREG = 1;
+    parameter integer INMODEREG = 1;
+    parameter integer MREG = 1;
+    parameter integer OPMODEREG = 1;
+    parameter integer PREG = 1;
+    parameter SEL_MASK = "MASK";
+    parameter SEL_PATTERN = "PATTERN";
+    parameter USE_DPORT = "FALSE";
+    parameter USE_MULT = "MULTIPLY";
+    parameter USE_PATTERN_DETECT = "NO_PATDET";
+    parameter USE_SIMD = "ONE48";
+    parameter [47:0] MASK = 48'h3FFFFFFFFFFF;
+    parameter [47:0] PATTERN = 48'h000000000000;
+    parameter [3:0] IS_ALUMODE_INVERTED = 4'b0;
+    parameter [0:0] IS_CARRYIN_INVERTED = 1'b0;
+    parameter [0:0] IS_CLK_INVERTED = 1'b0;
+    parameter [4:0] IS_INMODE_INVERTED = 5'b0;
+    parameter [6:0] IS_OPMODE_INVERTED = 7'b0;
+    // Start-of-simulation guard (Icarus only): abort on any parameter setting this model does not implement.
+    initial begin
+`ifdef __ICARUS__
+        if (ACASCREG != 0)          $fatal(1, "Unsupported ACASCREG value");
+        if (ADREG != 0)             $fatal(1, "Unsupported ADREG value");
+        if (ALUMODEREG != 0)        $fatal(1, "Unsupported ALUMODEREG value");
+        if (AREG == 2)              $fatal(1, "Unsupported AREG value");
+        if (AUTORESET_PATDET != "NO_RESET") $fatal(1, "Unsupported AUTORESET_PATDET value");
+        if (A_INPUT != "DIRECT")    $fatal(1, "Unsupported A_INPUT value");
+        if (BCASCREG != 0)          $fatal(1, "Unsupported BCASCREG value");
+        if (BREG == 2)              $fatal(1, "Unsupported BREG value");
+        if (B_INPUT != "DIRECT")    $fatal(1, "Unsupported B_INPUT value");
+        if (CARRYINREG != 0)        $fatal(1, "Unsupported CARRYINREG value");
+        if (CARRYINSELREG != 0)     $fatal(1, "Unsupported CARRYINSELREG value");
+        if (CREG != 0)              $fatal(1, "Unsupported CREG value");
+        if (DREG != 0)              $fatal(1, "Unsupported DREG value");
+        if (INMODEREG != 0)         $fatal(1, "Unsupported INMODEREG value");
+        if (MREG != 0)              $fatal(1, "Unsupported MREG value");
+        if (OPMODEREG != 0)         $fatal(1, "Unsupported OPMODEREG value");
+        //if (PREG != 0)              $fatal(1, "Unsupported PREG value");
+        if (SEL_MASK != "MASK")     $fatal(1, "Unsupported SEL_MASK value");
+        if (SEL_PATTERN != "PATTERN") $fatal(1, "Unsupported SEL_PATTERN value");
+        if (USE_DPORT != "FALSE")   $fatal(1, "Unsupported USE_DPORT value");
+        if (USE_MULT != "MULTIPLY") $fatal(1, "Unsupported USE_MULT value");
+        if (USE_PATTERN_DETECT != "NO_PATDET") $fatal(1, "Unsupported USE_PATTERN_DETECT value");
+        if (USE_SIMD != "ONE48")    $fatal(1, "Unsupported USE_SIMD value");
+        if (IS_ALUMODE_INVERTED != 4'b0) $fatal(1, "Unsupported IS_ALUMODE_INVERTED value");
+        if (IS_CARRYIN_INVERTED != 1'b0) $fatal(1, "Unsupported IS_CARRYIN_INVERTED value");
+        if (IS_CLK_INVERTED != 1'b0) $fatal(1, "Unsupported IS_CLK_INVERTED value");
+        if (IS_INMODE_INVERTED != 5'b0) $fatal(1, "Unsupported IS_INMODE_INVERTED value");
+        if (IS_OPMODE_INVERTED != 7'b0) $fatal(1, "Unsupported IS_OPMODE_INVERTED value");
+`endif
+    end
+    // Optional input stage: A/B are registered on CLK (enabled by CEA2/CEB2) when AREG/BREG == 1, else passed through.
+    reg signed [29:0] Ar;
+    reg signed [17:0] Br;
+    reg signed [47:0] Pr;
+    generate
+        if (AREG == 1) begin always @(posedge CLK) if (CEA2) Ar <= A; end
+        else           always @* Ar <= A; // NOTE(review): nonblocking assignment in a combinational block - confirm intended
+        if (BREG == 1) begin always @(posedge CLK) if (CEB2) Br <= B; end
+        else           always @* Br <= B;
+    endgenerate
+    // Combinational product of the low 25 bits of Ar and Br; Pr[47:43] stay at x in this model.
+    always @* begin
+        Pr <= {48{1'bx}};
+`ifdef __ICARUS__
+        if (INMODE != 4'b0000)      $fatal(1, "Unsupported INMODE value");
+        if (ALUMODE != 4'b0000)     $fatal(1, "Unsupported ALUMODE value");
+        if (OPMODE != 7'b000101)    $fatal(1, "Unsupported OPMODE value");
+        if (CARRYINSEL != 3'b000)   $fatal(1, "Unsupported CARRYINSEL value");
+        if (ACIN != 30'b0)          $fatal(1, "Unsupported ACIN value");
+        if (BCIN != 18'b0)          $fatal(1, "Unsupported BCIN value");
+        if (PCIN != 48'b0)          $fatal(1, "Unsupported PCIN value");
+        if (CARRYIN != 1'b0)        $fatal(1, "Unsupported CARRYIN value");
+`endif
+        Pr[42:0] <= $signed(Ar[24:0]) * Br;
+    end
+    // Optional output stage: P is registered on CLK (enabled by CEP) when PREG == 1, else follows Pr.
+    generate
+        if (PREG == 1) begin always @(posedge CLK) if (CEP) P <= Pr; end
+        else           always @* P <= Pr;
+    endgenerate
+
+endmodule
diff --git a/techlibs/xilinx/cells_xtra.v b/techlibs/xilinx/cells_xtra.v
index 15fa1b63a..d79349225 100644
--- a/techlibs/xilinx/cells_xtra.v
+++ b/techlibs/xilinx/cells_xtra.v
@@ -111,88 +111,6 @@ module DNA_PORT (...);
     input CLK, DIN, READ, SHIFT;
 endmodule
 
-module DSP48E1 (...);
-    parameter integer ACASCREG = 1;
-    parameter integer ADREG = 1;
-    parameter integer ALUMODEREG = 1;
-    parameter integer AREG = 1;
-    parameter AUTORESET_PATDET = "NO_RESET";
-    parameter A_INPUT = "DIRECT";
-    parameter integer BCASCREG = 1;
-    parameter integer BREG = 1;
-    parameter B_INPUT = "DIRECT";
-    parameter integer CARRYINREG = 1;
-    parameter integer CARRYINSELREG = 1;
-    parameter integer CREG = 1;
-    parameter integer DREG = 1;
-    parameter integer INMODEREG = 1;
-    parameter integer MREG = 1;
-    parameter integer OPMODEREG = 1;
-    parameter integer PREG = 1;
-    parameter SEL_MASK = "MASK";
-    parameter SEL_PATTERN = "PATTERN";
-    parameter USE_DPORT = "FALSE";
-    parameter USE_MULT = "MULTIPLY";
-    parameter USE_PATTERN_DETECT = "NO_PATDET";
-    parameter USE_SIMD = "ONE48";
-    parameter [47:0] MASK = 48'h3FFFFFFFFFFF;
-    parameter [47:0] PATTERN = 48'h000000000000;
-    parameter [3:0] IS_ALUMODE_INVERTED = 4'b0;
-    parameter [0:0] IS_CARRYIN_INVERTED = 1'b0;
-    parameter [0:0] IS_CLK_INVERTED = 1'b0;
-    parameter [4:0] IS_INMODE_INVERTED = 5'b0;
-    parameter [6:0] IS_OPMODE_INVERTED = 7'b0;
-    output [29:0] ACOUT;
-    output [17:0] BCOUT;
-    output CARRYCASCOUT;
-    output [3:0] CARRYOUT;
-    output MULTSIGNOUT;
-    output OVERFLOW;
-    output [47:0] P;
-    output PATTERNBDETECT;
-    output PATTERNDETECT;
-    output [47:0] PCOUT;
-    output UNDERFLOW;
-    input [29:0] A;
-    input [29:0] ACIN;
-    input [3:0] ALUMODE;
-    input [17:0] B;
-    input [17:0] BCIN;
-    input [47:0] C;
-    input CARRYCASCIN;
-    input CARRYIN;
-    input [2:0] CARRYINSEL;
-    input CEA1;
-    input CEA2;
-    input CEAD;
-    input CEALUMODE;
-    input CEB1;
-    input CEB2;
-    input CEC;
-    input CECARRYIN;
-    input CECTRL;
-    input CED;
-    input CEINMODE;
-    input CEM;
-    input CEP;
-    input CLK;
-    input [24:0] D;
-    input [4:0] INMODE;
-    input MULTSIGNIN;
-    input [6:0] OPMODE;
-    input [47:0] PCIN;
-    input RSTA;
-    input RSTALLCARRYIN;
-    input RSTALUMODE;
-    input RSTB;
-    input RSTC;
-    input RSTCTRL;
-    input RSTD;
-    input RSTINMODE;
-    input RSTM;
-    input RSTP;
-endmodule
-
 module EFUSE_USR (...);
     parameter [31:0] SIM_EFUSE_VALUE = 32'h00000000;
     output [31:0] EFUSEUSR;
diff --git a/techlibs/xilinx/dsp_map.v b/techlibs/xilinx/dsp_map.v
new file mode 100644
index 000000000..3d7b09d69
--- /dev/null
+++ b/techlibs/xilinx/dsp_map.v
@@ -0,0 +1,46 @@
+module \$__MUL25X18 (input signed [24:0] A, input signed [17:0] B, output signed [42:0] Y); // techmap template: one signed 25x18 multiply -> DSP48E1
+	parameter A_SIGNED = 0;
+	parameter B_SIGNED = 0;
+	parameter A_WIDTH = 0;
+	parameter B_WIDTH = 0;
+	parameter Y_WIDTH = 0;
+
+	wire [47:0] P_48; // full-width DSP output; only the low 43 bits are forwarded to Y
+	DSP48E1 #(
+		// Disable all registers
+		.ACASCREG(0),
+		.ADREG(0),
+		.A_INPUT("DIRECT"),
+		.ALUMODEREG(0),
+		.AREG(0),
+		.BCASCREG(0),
+		.B_INPUT("DIRECT"),
+		.BREG(0),
+		.CARRYINREG(0),
+		.CARRYINSELREG(0),
+		.CREG(0),
+		.DREG(0),
+		.INMODEREG(0),
+		.MREG(0),
+		.OPMODEREG(0),
+		.PREG(0)
+	) _TECHMAP_REPLACE_ (
+		//Data path
+		.A({{5{A[24]}}, A}), // sign-extend the 25-bit operand to the 30-bit A port
+		.B(B),
+		.C(48'b0),
+		.D(25'b0), // D port is 25 bits wide
+		.P(P_48),
+
+		.INMODE(5'b00000), // INMODE port is 5 bits wide
+		.ALUMODE(4'b0000),
+		.OPMODE(7'b0000101), // same value as before, written with all 7 bits
+		.CARRYINSEL(3'b000),
+
+		.ACIN(30'b0),
+		.BCIN(18'b0),
+		.PCIN(48'b0),
+		.CARRYIN(1'b0)
+	);
+	assign Y = P_48;
+endmodule
diff --git a/techlibs/xilinx/synth_xilinx.cc b/techlibs/xilinx/synth_xilinx.cc
index b672a0d4f..e5a27015a 100644
--- a/techlibs/xilinx/synth_xilinx.cc
+++ b/techlibs/xilinx/synth_xilinx.cc
@@ -78,6 +78,9 @@ struct SynthXilinxPass : public ScriptPass
 		log("    -nowidelut\n");
 		log("        do not use MUXF[78] resources to implement LUTs larger than LUT6s\n");
 		log("\n");
+		log("    -nodsp\n");
+		log("        do not use DSP48E1s to implement multipliers and associated logic\n");
+		log("\n");
 		log("    -widemux <int>\n");
 		log("        enable inference of hard multiplexer resources (MUXF[78]) for muxes at or\n");
 		log("        above this number of inputs (minimum value 2, recommended value >= 5).\n");
@@ -104,7 +107,7 @@ struct SynthXilinxPass : public ScriptPass
 	}
 
 	std::string top_opt, edif_file, blif_file, family;
-	bool flatten, retime, vpr, nobram, nodram, nosrl, nocarry, nowidelut, abc9;
+	bool flatten, retime, vpr, nobram, nodram, nosrl, nocarry, nowidelut, nodsp, abc9;
 	int widemux;
 
 	void clear_flags() YS_OVERRIDE
@@ -122,6 +125,7 @@ struct SynthXilinxPass : public ScriptPass
 		nosrl = false;
 		nocarry = false;
 		nowidelut = false;
+		nodsp = false;
 		abc9 = false;
 		widemux = 0;
 	}
@@ -202,6 +206,10 @@ struct SynthXilinxPass : public ScriptPass
 				abc9 = true;
 				continue;
 			}
+			if (args[argidx] == "-nodsp") {
+				nodsp = true;
+				continue;
+			}
 			break;
 		}
 		extra_args(args, argidx, design);
@@ -249,8 +257,8 @@ struct SynthXilinxPass : public ScriptPass
 
 		if (check_label("coarse")) {
 			run("proc");
-			if (help_mode || flatten)
-				run("flatten", "(if -flatten)");
+			if (flatten || help_mode)
+				run("flatten", "(with '-flatten')");
 			run("opt_expr");
 			run("opt_clean");
 			run("check");
@@ -275,6 +283,12 @@ struct SynthXilinxPass : public ScriptPass
 			}
 
 			run("techmap -map +/cmp2lut.v -D LUT_WIDTH=6");
+
+			if (!nodsp || help_mode) {
+				// NB: Xilinx multipliers are signed only
+				run("techmap -map +/mul2dsp.v -D DSP_A_MAXWIDTH=25 -D DSP_B_MAXWIDTH=18 -D DSP_SIGNEDONLY=1 -D DSP_NAME=$__MUL25X18");
+			}
+
 			run("alumacc");
 			run("share");
 			run("opt");
@@ -317,6 +331,10 @@ struct SynthXilinxPass : public ScriptPass
 			run("memory_map");
 			run("dffsr2dff");
 			run("dff2dffe");
+			if (help_mode || !nodsp) {
+				run("techmap -map +/xilinx/dsp_map.v", "(skip if '-nodsp')");
+				run("xilinx_dsp", "                     (skip if '-nodsp')");
+			}
 			if (help_mode) {
 				run("simplemap t:$mux", "         ('-widemux' only)");
 				run("muxcover <internal options>, ('-widemux' only)");
diff --git a/tests/various/wreduce.ys b/tests/various/wreduce.ys
new file mode 100644
index 000000000..deb99304d
--- /dev/null
+++ b/tests/various/wreduce.ys
@@ -0,0 +1,118 @@
+
+read_verilog <<EOT
+module wreduce_add_test(input [3:0] i, input [7:0] j, output [8:0] o);
+    assign o = (i << 4) + j;
+endmodule
+EOT
+# Elaborate the module and save the unoptimized design as the reference copy (gold).
+hierarchy -auto-top
+proc
+design -save gold
+
+prep # calls wreduce
+# Exactly one $add must remain, with A_WIDTH=4, B_WIDTH=4 and Y_WIDTH=5.
+select -assert-count 1 t:$add r:A_WIDTH=4 r:B_WIDTH=4 r:Y_WIDTH=5 %i %i %i
+
+design -stash gate
+
+design -import gold -as gold
+design -import gate -as gate
+# Build a miter of gold against gate and prove its assertions with the SAT solver.
+miter -equiv -flatten -make_assert -make_outputs gold gate miter
+sat -verify -prove-asserts -show-ports miter
+
+##########
+
+read_verilog <<EOT
+module wreduce_sub_test1(input [3:0] i, input [7:0] j, output [8:0] o);
+    assign o = j - (i << 4);
+endmodule
+EOT
+# Elaborate the module and save the unoptimized design as the reference copy (gold).
+hierarchy -auto-top
+proc
+design -save gold
+
+prep # calls wreduce
+# Exactly one $sub must remain, with A_WIDTH=4, B_WIDTH=4 and Y_WIDTH=5.
+select -assert-count 1 t:$sub r:A_WIDTH=4 r:B_WIDTH=4 r:Y_WIDTH=5 %i %i %i
+
+design -stash gate
+
+design -import gold -as gold
+design -import gate -as gate
+# Build a miter of gold against gate and prove its assertions with the SAT solver.
+miter -equiv -flatten -make_assert -make_outputs gold gate miter
+sat -verify -prove-asserts -show-ports miter
+
+##########
+
+read_verilog <<EOT
+module wreduce_sub_test2(input [3:0] i, input [7:0] j, output [8:0] o);
+    assign o = (i << 4) - j;
+endmodule
+EOT
+# Elaborate the module and save the unoptimized design as the reference copy (gold).
+hierarchy -auto-top
+proc
+design -save gold
+
+prep # calls wreduce
+# Exactly one $sub must remain, here with the wider A_WIDTH=8, B_WIDTH=8 and Y_WIDTH=9.
+select -assert-count 1 t:$sub r:A_WIDTH=8 r:B_WIDTH=8 r:Y_WIDTH=9 %i %i %i
+
+design -stash gate
+
+design -import gold -as gold
+design -import gate -as gate
+# Build a miter of gold against gate and prove its assertions with the SAT solver.
+miter -equiv -flatten -make_assert -make_outputs gold gate miter
+sat -verify -prove-asserts -show-ports miter
+
+##########
+
+read_verilog <<EOT
+module wreduce_sub_test3(input [3:0] i, input [7:0] j, output [8:0] o);
+    assign o = (j >> 4) - i;
+endmodule
+EOT
+# Elaborate the module and save the unoptimized design as the reference copy (gold).
+hierarchy -auto-top
+proc
+design -save gold
+
+prep # calls wreduce
+# Exactly one $sub must remain, with A_WIDTH=4, B_WIDTH=4 and Y_WIDTH=5.
+select -assert-count 1 t:$sub r:A_WIDTH=4 r:B_WIDTH=4 r:Y_WIDTH=5 %i %i %i
+
+design -stash gate
+
+design -import gold -as gold
+design -import gate -as gate
+# Build a miter of gold against gate and prove its assertions with the SAT solver.
+miter -equiv -flatten -make_assert -make_outputs gold gate miter
+sat -verify -prove-asserts -show-ports miter
+
+##########
+
+read_verilog <<EOT
+module wreduce_sub_test4(input [3:0] i, output [8:0] o);
+    assign o = 5'b00010 - i;
+endmodule
+EOT
+# Elaborate the module and save the unoptimized design as the reference copy (gold).
+hierarchy -auto-top
+proc
+design -save gold
+
+prep # calls wreduce
+# Exactly one $sub must remain, with the constant operand at A_WIDTH=2, B_WIDTH=4 and Y_WIDTH=5.
+select -assert-count 1 t:$sub r:A_WIDTH=2 r:B_WIDTH=4 r:Y_WIDTH=5 %i %i %i
+
+design -stash gate
+
+design -import gold -as gold
+design -import gate -as gate
+# Build a miter of gold against gate and prove its assertions with the SAT solver.
+miter -equiv -flatten -make_assert -make_outputs gold gate miter
+sat -verify -prove-asserts -show-ports miter