9 files changed, 246 insertions, 200 deletions
diff --git a/kernel/rtlil.cc b/kernel/rtlil.cc
index 909f60dd9..fade0bc36 100644
--- a/kernel/rtlil.cc
+++ b/kernel/rtlil.cc
@@ -3299,7 +3299,7 @@ void RTLIL::SigSpec::replace(int offset, const RTLIL::SigSpec &with)
 	check();
 }
 
-RTLIL::SigSpec& RTLIL::SigSpec::remove_const()
+void RTLIL::SigSpec::remove_const()
 {
 	if (packed())
 	{
@@ -3333,7 +3333,6 @@ RTLIL::SigSpec& RTLIL::SigSpec::remove_const()
 	}
 
 	check();
-	return *this;
 }
 
 void RTLIL::SigSpec::remove(int offset, int length)
@@ -3429,7 +3428,7 @@ void RTLIL::SigSpec::append_bit(const RTLIL::SigBit &bit)
 	check();
 }
 
-RTLIL::SigSpec& RTLIL::SigSpec::extend_u0(int width, bool is_signed)
+void RTLIL::SigSpec::extend_u0(int width, bool is_signed)
 {
 	cover("kernel.rtlil.sigspec.extend_u0");
 
@@ -3446,7 +3445,6 @@ RTLIL::SigSpec& RTLIL::SigSpec::extend_u0(int width, bool is_signed)
 			append(padding);
 	}
 
-	return *this;
 }
 
 RTLIL::SigSpec RTLIL::SigSpec::repeat(int num) const
diff --git a/kernel/rtlil.h b/kernel/rtlil.h
index 16fd852ba..37b5f984c 100644
--- a/kernel/rtlil.h
+++ b/kernel/rtlil.h
@@ -754,8 +754,8 @@ public:
 	inline int size() const { return width_; }
 	inline bool empty() const { return width_ == 0; }
 
-	inline RTLIL::SigBit &operator[](int index) { inline_unpack(); return index >= 0 ? bits_.at(index) : bits_.at(width_ + index); }
-	inline const RTLIL::SigBit &operator[](int index) const { inline_unpack(); return index >= 0 ? bits_.at(index) : bits_.at(width_ + index); }
+	inline RTLIL::SigBit &operator[](int index) { inline_unpack(); return bits_.at(index); } // index is passed straight to bits_.at(); the former negative (counted from the end) form is no longer translated
+	inline const RTLIL::SigBit &operator[](int index) const { inline_unpack(); return bits_.at(index); } // const overload, same indexing rule as above
 
 	inline RTLIL::SigSpecIterator begin() { RTLIL::SigSpecIterator it; it.sig_p = this; it.index = 0; return it; }
 	inline RTLIL::SigSpecIterator end() { RTLIL::SigSpecIterator it; it.sig_p = this; it.index = width_; return it; }
@@ -787,7 +787,7 @@ public:
 	void remove2(const std::set<RTLIL::SigBit> &pattern, RTLIL::SigSpec *other);
 
 	void remove(int offset, int length = 1);
-	RTLIL::SigSpec& remove_const();
+	void remove_const(); // edits this SigSpec in place; no longer returns *this, so the call cannot be chained
 
 	RTLIL::SigSpec extract(const RTLIL::SigSpec &pattern, const RTLIL::SigSpec *other = NULL) const;
 	RTLIL::SigSpec extract(const pool<RTLIL::SigBit> &pattern, const RTLIL::SigSpec *other = NULL) const;
@@ -797,7 +797,7 @@ public:
 	void append(const RTLIL::SigSpec &signal);
 	void append_bit(const RTLIL::SigBit &bit);
 
-	RTLIL::SigSpec& extend_u0(int width, bool is_signed = false);
+	void extend_u0(int width, bool is_signed = false); // edits this SigSpec in place; no longer returns *this, so the result cannot be used inline
 
 	RTLIL::SigSpec repeat(int num) const;
 
diff --git a/passes/pmgen/ice40_dsp.cc b/passes/pmgen/ice40_dsp.cc
index bb45b8a4e..a1a397b83 100644
--- a/passes/pmgen/ice40_dsp.cc
+++ b/passes/pmgen/ice40_dsp.cc
@@ -23,9 +23,10 @@
 USING_YOSYS_NAMESPACE
 PRIVATE_NAMESPACE_BEGIN
 
-template<class T> bool includes(const T &lhs, const T &rhs) {
+template<class T> inline bool includes(const T &lhs, const T &rhs) {
 	return std::includes(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
 }
+#include <set>
 #include "passes/pmgen/ice40_dsp_pm.h"
 
 void create_ice40_dsp(ice40_dsp_pm &pm)
@@ -37,7 +38,7 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 	log("ffA:    %s\n", log_id(st.ffA, "--"));
 	log("ffB:    %s\n", log_id(st.ffB, "--"));
 	log("mul:    %s\n", log_id(st.mul, "--"));
-	log("ffH:    %s\n", log_id(st.ffH, "--"));
+	log("ffFJKG: %s\n", log_id(st.ffFJKG, "--"));
 	log("addAB:  %s\n", log_id(st.addAB, "--"));
 	log("muxAB:  %s\n", log_id(st.muxAB, "--"));
 	log("ffO_lo: %s\n", log_id(st.ffO_lo, "--"));
@@ -118,8 +119,8 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 		if (st.ffB)
 			log(" ffB:%s", log_id(st.ffB));
 
-		if (st.ffH)
-			log(" ffH:%s", log_id(st.ffH));
+		if (st.ffFJKG)
+			log(" ffFJKG:%s", log_id(st.ffFJKG));
 
 		if (st.ffO_lo)
 			log(" ffO_lo:%s", log_id(st.ffO_lo));
@@ -154,9 +155,9 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 		// If we have a signed multiply-add, then perform sign extension
 		// TODO: Need to check CD[31:16] is sign extension of CD[15:0]?
 		if (st.addAB->getParam("\\A_SIGNED").as_bool() && st.addAB->getParam("\\B_SIGNED").as_bool())
-			pm.module->connect(O[-1], O[-2]);
+			pm.module->connect(O[32], O[31]);
 		else
-			cell->setPort("\\CO", O[-1]);
+			cell->setPort("\\CO", O[32]);
 		O.remove(O_width-1);
 	}
 	else
@@ -205,9 +206,9 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 	cell->setParam("\\C_REG", State::S0);
 	cell->setParam("\\D_REG", State::S0);
 
-	cell->setParam("\\TOP_8x8_MULT_REG", st.ffH ? State::S1 : State::S0);
-	cell->setParam("\\BOT_8x8_MULT_REG", st.ffH ? State::S1 : State::S0);
-	cell->setParam("\\PIPELINE_16x16_MULT_REG1", st.ffH ? State::S1 : State::S0);
+	cell->setParam("\\TOP_8x8_MULT_REG", st.ffFJKG ? State::S1 : State::S0);
+	cell->setParam("\\BOT_8x8_MULT_REG", st.ffFJKG ? State::S1 : State::S0);
+	cell->setParam("\\PIPELINE_16x16_MULT_REG1", st.ffFJKG ? State::S1 : State::S0);
 	cell->setParam("\\PIPELINE_16x16_MULT_REG2", State::S0);
 
 	cell->setParam("\\TOPOUTPUT_SELECT", Const(st.ffO_hi ? 1 : (st.addAB ? 0 : 3), 2));
@@ -228,7 +229,7 @@ void create_ice40_dsp(ice40_dsp_pm &pm)
 		pm.autoremove(st.mul);
 	else
 		pm.blacklist(st.mul);
-	pm.autoremove(st.ffH);
+	pm.autoremove(st.ffFJKG);
 	pm.autoremove(st.addAB);
 	if (st.ffO_lo) {
 			SigSpec O = st.sigO.extract(0,std::min(16,st.ffO_lo->getParam("\\WIDTH").as_int()));
diff --git a/passes/pmgen/ice40_dsp.pmg b/passes/pmgen/ice40_dsp.pmg
index c59c5d20a..11064e072 100644
--- a/passes/pmgen/ice40_dsp.pmg
+++ b/passes/pmgen/ice40_dsp.pmg
@@ -2,6 +2,7 @@ pattern ice40_dsp
 
 state <SigBit> clock
 state <bool> clock_pol
+state <std::set<SigBit>> sigAset sigBset
 state <SigSpec> sigA sigB sigCD sigH sigO sigOused
 state <Cell*> addAB muxAB
 
@@ -10,6 +11,15 @@ match mul
 	select GetSize(mul->getPort(\A)) + GetSize(mul->getPort(\B)) > 10
 endmatch
 
+code sigAset sigBset
+	SigSpec A = port(mul, \A); // work on a copy: remove_const() edits in place and returns void
+	A.remove_const();
+	sigAset = A.to_sigbit_set(); // computed once here and reused by both ffA match conditions
+	SigSpec B = port(mul, \B); // same treatment for the B operand
+	B.remove_const();
+	sigBset = B.to_sigbit_set(); // reused by both ffB match conditions
+endcode
+
 code sigH
 	if (mul->type == $mul)
 		sigH = mul->getPort(\Y);
@@ -22,9 +32,9 @@ endcode
 
 match ffA
 	if mul->type != \SB_MAC16 || !param(mul, \A_REG).as_bool()
-	if !port(mul, \A).remove_const().empty()
+	if !sigAset.empty()
 	select ffA->type.in($dff)
-	filter includes(port(ffA, \Q).to_sigbit_set(), port(mul, \A).remove_const().to_sigbit_set())
+	filter includes(port(ffA, \Q).to_sigbit_set(), sigAset)
 	optional
 endmatch
 
@@ -45,9 +55,9 @@ endcode
 
 match ffB
 	if mul->type != \SB_MAC16 || !param(mul, \B_REG).as_bool()
-	if !port(mul, \B).remove_const().empty()
+	if !sigBset.empty()
 	select ffB->type.in($dff)
-	filter includes(port(ffB, \Q).to_sigbit_set(), port(mul, \B).remove_const().to_sigbit_set())
+	filter includes(port(ffB, \Q).to_sigbit_set(), sigBset)
 	optional
 endmatch
 
@@ -72,11 +82,11 @@ code sigB clock clock_pol
 	}
 endcode
 
-match ffH
+match ffFJKG
 	if mul->type != \SB_MAC16 || (!param(mul, \TOP_8x8_MULT_REG).as_bool() && !param(mul, \BOT_8x8_MULT_REG).as_bool() && !param(mul, \PIPELINE_16x16_MULT_REG1).as_bool() && !param(mul, \PIPELINE_16x16_MULT_REG2).as_bool())
-	select ffH->type.in($dff)
-	select nusers(port(ffH, \D)) == 2
-	index <SigSpec> port(ffH, \D) === sigH
+	select ffFJKG->type.in($dff)
+	select nusers(port(ffFJKG, \D)) == 2
+	index <SigSpec> port(ffFJKG, \D) === sigH
 	// Ensure pipeline register is not already used
 	optional
 endmatch
@@ -84,16 +94,16 @@ endmatch
 code sigH sigO clock clock_pol
 	sigO = sigH;
 
-	if (ffH) {
-		sigH = port(ffH, \Q);
+	if (ffFJKG) {
+		sigH = port(ffFJKG, \Q);
 		for (auto b : sigH)
 			if (b.wire->get_bool_attribute(\keep))
 				reject;
 
 		sigO = sigH;
 
-		SigBit c = port(ffH, \CLK).as_bit();
-		bool cp = param(ffH, \CLK_POLARITY).as_bool();
+		SigBit c = port(ffFJKG, \CLK).as_bit();
+		bool cp = param(ffFJKG, \CLK_POLARITY).as_bool();
 
 		if (clock != SigBit() && (c != clock || cp != clock_pol))
 			reject;
@@ -192,18 +202,34 @@ endcode
 match ffO_lo
 	if nusers(sigOused.extract(0,std::min(16,GetSize(sigOused)))) == 2
 	select ffO_lo->type.in($dff)
-	filter includes(port(ffO_lo, \D).to_sigbit_set(), sigOused.extract(0,std::min(16,param(ffO_lo, \WIDTH).as_int())).remove_const().to_sigbit_set())
 	optional
 endmatch
 
+code
+	if (ffO_lo) { // ffO_lo is optional; nothing to verify when no flop was matched
+		SigSpec O = sigOused.extract(0,std::min(16,param(ffO_lo, \WIDTH).as_int())); // low slice of sigOused: at most 16 bits and no wider than the flop
+		O.remove_const(); // in-place call, hence the named local instead of a chained expression
+		if (!includes(port(ffO_lo, \D).to_sigbit_set(), O.to_sigbit_set())) // subset test that used to be written as a filter on the match
+			reject;
+	}
+endcode
+
 match ffO_hi
 	if GetSize(sigOused) > 16
 	if nusers(sigOused.extract_end(16)) == 2
 	select ffO_hi->type.in($dff)
-	filter includes(port(ffO_hi, \D).to_sigbit_set(), sigOused.extract_end(16).remove_const().to_sigbit_set())
 	optional
 endmatch
 
+code
+	if (ffO_hi) { // ffO_hi is optional; nothing to verify when no flop was matched
+		SigSpec O = sigOused.extract_end(16); // the part of sigOused from bit 16 upwards
+		O.remove_const(); // in-place call, hence the named local instead of a chained expression
+		if (!includes(port(ffO_hi, \D).to_sigbit_set(), O.to_sigbit_set())) // subset test that used to be written as a filter on the match
+			reject;
+	}
+endcode
+
 code clock clock_pol sigO sigCD
 	if (ffO_lo || ffO_hi) {
 		if (mul->type == \SB_MAC16) {
diff --git a/passes/pmgen/xilinx_dsp.cc b/passes/pmgen/xilinx_dsp.cc
index cd88f9449..e7b72e312 100644
--- a/passes/pmgen/xilinx_dsp.cc
+++ b/passes/pmgen/xilinx_dsp.cc
@@ -23,22 +23,23 @@
 USING_YOSYS_NAMESPACE
 PRIVATE_NAMESPACE_BEGIN
 
-template<class T> bool includes(const T &lhs, const T &rhs) {
+template<class T> inline bool includes(const T &lhs, const T &rhs) {
 	return std::includes(lhs.begin(), lhs.end(), rhs.begin(), rhs.end());
 }
+#include <set>
 #include "passes/pmgen/xilinx_dsp_pm.h"
 
-void pack_xilinx_dsp(xilinx_dsp_pm &pm)
+void pack_xilinx_dsp(dict<SigBit, Cell*> &bit_to_driver, xilinx_dsp_pm &pm)
 {
 	auto &st = pm.st_xilinx_dsp;
 
 #if 1
 	log("\n");
-	log("ffA:   %s\n", log_id(st.ffA, "--"));
-	log("ffB:   %s\n", log_id(st.ffB, "--"));
-	log("dsp:   %s\n", log_id(st.dsp, "--"));
-	log("addAB: %s\n", log_id(st.addAB, "--"));
-	log("ffP:   %s\n", log_id(st.ffP, "--"));
+	log("ffA:     %s\n", log_id(st.ffA, "--"));
+	log("ffB:     %s\n", log_id(st.ffB, "--"));
+	log("dsp:     %s\n", log_id(st.dsp, "--"));
+	log("addAB:   %s\n", log_id(st.addAB, "--"));
+	log("ffP:     %s\n", log_id(st.ffP, "--"));
 	//log("muxP:  %s\n", log_id(st.muxP, "--"));
 	log("sigPused: %s\n", log_signal(st.sigPused));
 #endif
@@ -46,11 +47,17 @@ void pack_xilinx_dsp(xilinx_dsp_pm &pm)
 	log("Analysing %s.%s for Xilinx DSP packing.\n", log_id(pm.module), log_id(st.dsp));
 
 	Cell *cell = st.dsp;
+	bit_to_driver.insert(std::make_pair(cell->getPort("\\P")[17], cell));
 	SigSpec P = st.sigP;
 
 	if (st.addAB) {
+		log_assert(st.addAB->getParam("\\A_SIGNED").as_bool());
+		log_assert(st.addAB->getParam("\\B_SIGNED").as_bool());
 		log("  adder %s (%s)\n", log_id(st.addAB), log_id(st.addAB->type));
-		cell->setPort("\\C", st.sigC.extend_u0(48, true));
+
+		SigSpec C = st.sigC;
+		C.extend_u0(48, true);
+		cell->setPort("\\C", C);
 		SigSpec &opmode = cell->connections_.at("\\OPMODE");
 		opmode[6] = State::S0;
 		opmode[5] = State::S1;
@@ -127,8 +134,8 @@ void pack_xilinx_dsp(xilinx_dsp_pm &pm)
 	pm.blacklist(cell);
 }
 
-struct Ice40DspPass : public Pass {
-	Ice40DspPass() : Pass("xilinx_dsp", "Xilinx: pack DSP registers") { }
+struct XilinxDspPass : public Pass {
+	XilinxDspPass() : Pass("xilinx_dsp", "Xilinx: pack DSP registers") { }
 	void help() YS_OVERRIDE
 	{
 		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
@@ -153,9 +160,49 @@ struct Ice40DspPass : public Pass {
 		}
 		extra_args(args, argidx, design);
 
-		for (auto module : design->selected_modules())
-			xilinx_dsp_pm(module, module->selected_cells()).run_xilinx_dsp(pack_xilinx_dsp);
+		for (auto module : design->selected_modules()) {
+			xilinx_dsp_pm pm(module, module->selected_cells()); // a named object rather than a temporary: pm.sigmap is used again after the run
+			dict<SigBit, Cell*> bit_to_driver; // per module; pack_xilinx_dsp() records bit 17 of each handled DSP's P port -> that DSP
+			auto f = [&bit_to_driver](xilinx_dsp_pm &pm){ pack_xilinx_dsp(bit_to_driver, pm); }; // adapts the two-argument packer to the one-argument callback
+			pm.run_xilinx_dsp(f);
+
+			// Look for ability to convert C input from another DSP into PCIN
+			//   NB: Needs to be done after pattern matcher has folded all
+			//       $add cells into the DSP
+			for (auto cell : module->cells()) {
+				if (cell->type != "\\DSP48E1")
+					continue;
+				SigSpec &opmode = cell->connections_.at("\\OPMODE"); // a reference, so the bit writes below change the port on the cell itself
+				if (opmode.extract(4,3) != Const::from_string("011")) // only cells whose three OPMODE bits starting at bit 4 equal this constant
+					continue;
+				SigSpec C = pm.sigmap(cell->getPort("\\C"));
+				if (C.has_const()) // any constant bit in C rules the cell out
+					continue;
+				auto it = bit_to_driver.find(C[0]); // NOTE(review): C is sigmapped here but the keys are inserted unmapped in pack_xilinx_dsp -- confirm they agree
+				if (it == bit_to_driver.end())
+					continue;
+				auto driver = it->second;
+
+				// Unextend C
+				int i;
+				for (i = GetSize(C)-1; i > 0; i--) // scan down from the top bit while neighbouring bits are identical
+					if (C[i] != C[i-1])
+						break;
+				if (i > 48-17) // presumably P is 48 bits wide, leaving 48-17 bits from bit 17 upwards -- TODO confirm
+					continue;
+				if (driver->getPort("\\P").extract(17, i) == C.extract(0, i)) { // NOTE(review): compares i bits only, so C[i] itself is never checked -- confirm intended
+					cell->setPort("\\C", Const(0, 48));
+					Wire *cascade = module->addWire(NEW_ID, 48); // fresh 48-bit wire connecting the driver's PCOUT to this cell's PCIN
+					driver->setPort("\\PCOUT", cascade);
+					cell->setPort("\\PCIN", cascade);
+					opmode[6] = State::S1;
+					opmode[5] = State::S0;
+					opmode[4] = State::S1;
+					bit_to_driver.erase(it); // drop the entry so this driver cannot be matched by a later cell
+				}
+			}
+		}
 	}
-} Ice40DspPass;
+} XilinxDspPass;
 
 PRIVATE_NAMESPACE_END
diff --git a/passes/pmgen/xilinx_dsp.pmg b/passes/pmgen/xilinx_dsp.pmg
index 5dee36a11..1a3dcdcbb 100644
--- a/passes/pmgen/xilinx_dsp.pmg
+++ b/passes/pmgen/xilinx_dsp.pmg
@@ -1,6 +1,7 @@
 pattern xilinx_dsp
 
 state <SigBit> clock
+state <std::set<SigBit>> sigAset sigBset
 state <SigSpec> sigC sigP sigPused
 state <Cell*> addAB
 
@@ -8,13 +9,22 @@ match dsp
 	select dsp->type.in(\DSP48E1)
 endmatch
 
+code sigAset sigBset
+	SigSpec A = port(dsp, \A); // work on a copy: remove_const() edits in place and returns void
+	A.remove_const();
+	sigAset = A.to_sigbit_set(); // computed once here and reused by both ffA match conditions
+	SigSpec B = port(dsp, \B); // same treatment for the B operand
+	B.remove_const();
+	sigBset = B.to_sigbit_set(); // reused by both ffB match conditions
+endcode
+
 match ffA
 	if param(dsp, \AREG).as_int() == 0
-	if !port(dsp, \A).remove_const().empty()
+	if !sigAset.empty()
 	select ffA->type.in($dff)
 	// DSP48E1 does not support clock inversion
 	select param(ffA, \CLK_POLARITY).as_bool()
-	filter includes(port(ffA, \Q).to_sigbit_set(), port(dsp, \A).remove_const().to_sigbit_set())
+	filter includes(port(ffA, \Q).to_sigbit_set(), sigAset)
 	optional
 endmatch
 
@@ -25,11 +35,11 @@ endcode
 
 match ffB
 	if param(dsp, \BREG).as_int() == 0
-	if !port(dsp, \B).remove_const().empty()
+	if !sigBset.empty()
 	select ffB->type.in($dff)
 	// DSP48E1 does not support clock inversion
 	select param(ffB, \CLK_POLARITY).as_bool()
-	filter includes(port(ffB, \Q).to_sigbit_set(), port(dsp, \B).remove_const().to_sigbit_set())
+	filter includes(port(ffB, \Q).to_sigbit_set(), sigBset)
 	optional
 endmatch
 
@@ -65,21 +75,18 @@ match addB
 	index <int> nusers(port(addB, \B)) === 2
 	//index <SigSpec> port(addB, \B) === sigP.extract(0, param(addB, \B_WIDTH).as_int())
 	filter param(addB, \B_WIDTH).as_int() <= GetSize(sigP)
-	filter port(addB, \B) ==  sigP.extract(0, param(addB, \B_WIDTH).as_int())
+	filter port(addB, \B) == sigP.extract(0, param(addB, \B_WIDTH).as_int())
 	optional
 endmatch
 
 code addAB sigC sigP
-	bool C_SIGNED = false;
 	if (addA) {
 		addAB = addA;
 		sigC = port(addAB, \B);
-		C_SIGNED = param(addAB, \B_SIGNED).as_bool();
 	}
 	if (addB) {
 		addAB = addB;
 		sigC = port(addAB, \A);
-		C_SIGNED = param(addAB, \B_SIGNED).as_bool();
 	}
 	if (addAB) {
 		// Ensure that adder is not used
@@ -97,7 +104,6 @@ code addAB sigC sigP
 		//	reject;
 
 		sigP = port(addAB, \Y);
-                sigC.extend_u0(32, C_SIGNED);
 	}
 endcode
 
diff --git a/techlibs/common/mul2dsp.v b/techlibs/common/mul2dsp.v
index 8e37201e2..75b1242a2 100644
--- a/techlibs/common/mul2dsp.v
+++ b/techlibs/common/mul2dsp.v
@@ -27,20 +27,30 @@
  */
 
 `ifndef DSP_A_MAXWIDTH
-$error("Macro DSP_A_MAXWIDTH must be defined");
+$fatal(1, "Macro DSP_A_MAXWIDTH must be defined");
 `endif
 `ifndef DSP_B_MAXWIDTH
-$error("Macro DSP_B_MAXWIDTH must be defined");
+$fatal(1, "Macro DSP_B_MAXWIDTH must be defined");
+`endif
+// DSP_A_MAXWIDTH_PARTIAL / DSP_B_MAXWIDTH_PARTIAL are optional: they give the
+// operand slice width used when a wide multiply is split into partial
+// products, and default to the corresponding full DSP_*_MAXWIDTH.
+`ifndef DSP_A_MAXWIDTH_PARTIAL
+`define DSP_A_MAXWIDTH_PARTIAL `DSP_A_MAXWIDTH
+`endif
+`ifndef DSP_B_MAXWIDTH_PARTIAL
+`define DSP_B_MAXWIDTH_PARTIAL `DSP_B_MAXWIDTH
 `endif
 
 `ifndef DSP_NAME
-$error("Macro DSP_NAME must be defined");
+$fatal(1, "Macro DSP_NAME must be defined");
 `endif
 
 `define MAX(a,b) (a > b ? a : b)
 `define MIN(a,b) (a < b ? a : b)
 
-module \$mul (A, B, Y); 
+(* techmap_celltype = "$mul $__mul" *)
+module _80_mul (A, B, Y);
 	parameter A_SIGNED = 0;
 	parameter B_SIGNED = 0;
 	parameter A_WIDTH = 1;
@@ -51,12 +61,26 @@ module \$mul (A, B, Y);
 	input [B_WIDTH-1:0] B;
 	output [Y_WIDTH-1:0] Y;
 
+	parameter _TECHMAP_CELLTYPE_ = "";
+
 	generate
-	if (A_SIGNED != B_SIGNED)
+	if (0) begin end
+`ifdef DSP_A_MINWIDTH
+	else if (A_WIDTH < `DSP_A_MINWIDTH)
+		wire _TECHMAP_FAIL_ = 1;
+`endif
+`ifdef DSP_B_MINWIDTH
+	else if (B_WIDTH < `DSP_B_MINWIDTH)
+		wire _TECHMAP_FAIL_ = 1;
+`endif
+`ifdef DSP_Y_MINWIDTH
+	else if (Y_WIDTH < `DSP_Y_MINWIDTH)
+		wire _TECHMAP_FAIL_ = 1;
+`endif
+	else if (_TECHMAP_CELLTYPE_ == "$mul" && A_SIGNED != B_SIGNED)
 		wire _TECHMAP_FAIL_ = 1;
-	// NB: A_SIGNED == B_SIGNED from here
 `ifdef DSP_SIGNEDONLY
-	else if (!A_SIGNED)
+	else if (_TECHMAP_CELLTYPE_ == "$mul" && !A_SIGNED)
 		\$mul #(
 			.A_SIGNED(1),
 			.B_SIGNED(1),
@@ -69,7 +93,7 @@ module \$mul (A, B, Y);
 			.Y(Y)
 		);
 `endif
-	else if (A_WIDTH < B_WIDTH)
+	else if (_TECHMAP_CELLTYPE_ == "$mul" && A_WIDTH < B_WIDTH)
 		\$mul #(
 			.A_SIGNED(B_SIGNED),
 			.B_SIGNED(A_SIGNED),
@@ -81,102 +105,53 @@ module \$mul (A, B, Y);
 			.B(A),
 			.Y(Y)
 		);
-	else
-		\$__mul #(
-			.A_SIGNED(A_SIGNED),
-			.B_SIGNED(B_SIGNED),
-			.A_WIDTH(A_WIDTH),
-			.B_WIDTH(B_WIDTH),
-			.Y_WIDTH(Y_WIDTH)
-		) _TECHMAP_REPLACE_ (
-			.A(A),
-			.B(B),
-			.Y(Y)
-		);
-	endgenerate
-endmodule
-
-module \$__mul (A, B, Y);
-	parameter A_SIGNED = 0;
-	parameter B_SIGNED = 0;
-	parameter A_WIDTH = 1;
-	parameter B_WIDTH = 1;
-	parameter Y_WIDTH = 1;
-
-	input [A_WIDTH-1:0] A;
-	input [B_WIDTH-1:0] B;
-	output [Y_WIDTH-1:0] Y;
-
-	wire [1023:0] _TECHMAP_DO_ = "proc; clean";
+	else begin
+		wire [1023:0] _TECHMAP_DO_ = "proc; clean";
 
 `ifdef DSP_SIGNEDONLY
-	localparam sign_headroom = 1;
+		localparam sign_headroom = 1;
 `else
-	localparam sign_headroom = 0;
+		localparam sign_headroom = 0;
 `endif
 
-	genvar i;
-	generate
-        if (0) begin end
-`ifdef DSP_A_MINWIDTH
-		else if (A_WIDTH < `DSP_A_MINWIDTH)
-			wire _TECHMAP_FAIL_ = 1;
-`endif
-`ifdef DSP_B_MINWIDTH
-		else if (B_WIDTH < `DSP_B_MINWIDTH)
-			wire _TECHMAP_FAIL_ = 1;
-`endif
-`ifdef DSP_Y_MINWIDTH
-		else if (Y_WIDTH < `DSP_Y_MINWIDTH)
-			wire _TECHMAP_FAIL_ = 1;
-`endif
-		else if (A_WIDTH > `DSP_A_MAXWIDTH) begin
-			localparam n = (A_WIDTH+`DSP_A_MAXWIDTH-sign_headroom-1) / (`DSP_A_MAXWIDTH-sign_headroom);
-			localparam partial_Y_WIDTH = `MIN(Y_WIDTH, B_WIDTH+`DSP_A_MAXWIDTH);
-			localparam last_Y_WIDTH = `MIN(partial_Y_WIDTH, B_WIDTH+A_WIDTH-(n-1)*(`DSP_A_MAXWIDTH-sign_headroom));
+		genvar i;
+		if (A_WIDTH > `DSP_A_MAXWIDTH) begin
+			localparam n = (A_WIDTH-`DSP_A_MAXWIDTH+`DSP_A_MAXWIDTH_PARTIAL-sign_headroom-1) / (`DSP_A_MAXWIDTH_PARTIAL-sign_headroom);
+			localparam partial_Y_WIDTH = `MIN(Y_WIDTH, B_WIDTH+`DSP_A_MAXWIDTH_PARTIAL);
+			localparam last_A_WIDTH = A_WIDTH-n*(`DSP_A_MAXWIDTH_PARTIAL-sign_headroom);
+			localparam last_Y_WIDTH = B_WIDTH+last_A_WIDTH;
 			if (A_SIGNED && B_SIGNED) begin
-				wire signed [partial_Y_WIDTH-1:0] partial [n-2:0];
+				wire signed [partial_Y_WIDTH-1:0] partial [n-1:0];
 				wire signed [last_Y_WIDTH-1:0] last_partial;
-				wire signed [Y_WIDTH-1:0] partial_sum [n-1:0];
+				wire signed [Y_WIDTH-1:0] partial_sum [n:0];
 			end
 			else begin
 				wire [partial_Y_WIDTH-1:0] partial [n-1:0];
 				wire [last_Y_WIDTH-1:0] last_partial;
-				wire [Y_WIDTH-1:0] partial_sum [n-1:0];
+				wire [Y_WIDTH-1:0] partial_sum [n:0];
 			end
 
-			\$__mul #(
-				.A_SIGNED(sign_headroom),
-				.B_SIGNED(B_SIGNED),
-				.A_WIDTH(`DSP_A_MAXWIDTH),
-				.B_WIDTH(B_WIDTH),
-				.Y_WIDTH(partial_Y_WIDTH)
-			) mul_slice_first (
-				.A({{sign_headroom{1'b0}}, A[`DSP_A_MAXWIDTH-sign_headroom-1 : 0]}),
-				.B(B),
-				.Y(partial[0])
-			);
-			assign partial_sum[0] = partial[0];
-
-			for (i = 1; i < n-1; i=i+1) begin:slice
+			for (i = 0; i < n; i=i+1) begin:slice
 				\$__mul #(
 					.A_SIGNED(sign_headroom),
 					.B_SIGNED(B_SIGNED),
-					.A_WIDTH(`DSP_A_MAXWIDTH),
+					.A_WIDTH(`DSP_A_MAXWIDTH_PARTIAL),
 					.B_WIDTH(B_WIDTH),
 					.Y_WIDTH(partial_Y_WIDTH)
 				) mul_slice (
-					.A({{sign_headroom{1'b0}}, A[i*(`DSP_A_MAXWIDTH-sign_headroom) +: `DSP_A_MAXWIDTH-sign_headroom]}),
+					.A({{sign_headroom{1'b0}}, A[i*(`DSP_A_MAXWIDTH_PARTIAL-sign_headroom) +: `DSP_A_MAXWIDTH_PARTIAL-sign_headroom]}),
 					.B(B),
 					.Y(partial[i])
 				);
 				// TODO: Currently a 'cascade' approach to summing the partial
 				//       products is taken here, but a more efficient 'binary
 				//       reduction' approach also exists...
-				assign partial_sum[i] = (partial[i] << i*(`DSP_A_MAXWIDTH-sign_headroom)) + partial_sum[i-1];
+				if (i == 0)
+					assign partial_sum[i] = partial[i];
+				else
+					assign partial_sum[i] = (partial[i] << i*(`DSP_A_MAXWIDTH_PARTIAL-sign_headroom)) + partial_sum[i-1];
 			end
 
-			localparam last_A_WIDTH = A_WIDTH-(n-1)*(`DSP_A_MAXWIDTH-sign_headroom);
 			\$__mul #(
 				.A_SIGNED(A_SIGNED),
 				.B_SIGNED(B_SIGNED),
@@ -188,56 +163,46 @@ module \$__mul (A, B, Y);
 				.B(B),
 				.Y(last_partial)
 			);
-			assign partial_sum[n-1] = (last_partial << (n-1)*(`DSP_A_MAXWIDTH-sign_headroom)) + partial_sum[n-2];
-			assign Y = partial_sum[n-1];
+			assign partial_sum[n] = (last_partial << n*(`DSP_A_MAXWIDTH_PARTIAL-sign_headroom)) + partial_sum[n-1];
+			assign Y = partial_sum[n];
 		end
 		else if (B_WIDTH > `DSP_B_MAXWIDTH) begin
-			localparam n = (B_WIDTH+`DSP_B_MAXWIDTH-sign_headroom-1) / (`DSP_B_MAXWIDTH-sign_headroom);
-			localparam partial_Y_WIDTH = `MIN(Y_WIDTH, A_WIDTH+`DSP_B_MAXWIDTH);
-			localparam last_Y_WIDTH = `MIN(partial_Y_WIDTH, A_WIDTH+B_WIDTH-(n-1)*(`DSP_B_MAXWIDTH-sign_headroom));
+			localparam n = (B_WIDTH-`DSP_B_MAXWIDTH+`DSP_B_MAXWIDTH_PARTIAL-sign_headroom-1) / (`DSP_B_MAXWIDTH_PARTIAL-sign_headroom);
+			localparam partial_Y_WIDTH = `MIN(Y_WIDTH, A_WIDTH+`DSP_B_MAXWIDTH_PARTIAL);
+			localparam last_B_WIDTH = B_WIDTH-n*(`DSP_B_MAXWIDTH_PARTIAL-sign_headroom);
+			localparam last_Y_WIDTH = A_WIDTH+last_B_WIDTH;
 			if (A_SIGNED && B_SIGNED) begin
-				wire signed [partial_Y_WIDTH-1:0] partial [n-2:0];
+				wire signed [partial_Y_WIDTH-1:0] partial [n-1:0];
 				wire signed [last_Y_WIDTH-1:0] last_partial;
-				wire signed [Y_WIDTH-1:0] partial_sum [n-1:0];
+				wire signed [Y_WIDTH-1:0] partial_sum [n:0];
 			end
 			else begin
 				wire [partial_Y_WIDTH-1:0] partial [n-1:0];
 				wire [last_Y_WIDTH-1:0] last_partial;
-				wire [Y_WIDTH-1:0] partial_sum [n-1:0];
+				wire [Y_WIDTH-1:0] partial_sum [n:0];
 			end
 
-			\$__mul #(
-				.A_SIGNED(A_SIGNED),
-				.B_SIGNED(sign_headroom),
-				.A_WIDTH(A_WIDTH),
-				.B_WIDTH(`DSP_B_MAXWIDTH),
-				.Y_WIDTH(partial_Y_WIDTH)
-			) mul_first (
-				.A(A),
-				.B({{sign_headroom{1'b0}}, B[`DSP_B_MAXWIDTH-sign_headroom-1 : 0]}),
-				.Y(partial[0])
-			);
-			assign partial_sum[0] = partial[0];
-
-			for (i = 1; i < n-1; i=i+1) begin:slice
+			for (i = 0; i < n; i=i+1) begin:slice
 				\$__mul #(
 					.A_SIGNED(A_SIGNED),
 					.B_SIGNED(sign_headroom),
 					.A_WIDTH(A_WIDTH),
-					.B_WIDTH(`DSP_B_MAXWIDTH),
+					.B_WIDTH(`DSP_B_MAXWIDTH_PARTIAL),
 					.Y_WIDTH(partial_Y_WIDTH)
 				) mul (
 					.A(A),
-					.B({{sign_headroom{1'b0}}, B[i*(`DSP_B_MAXWIDTH-sign_headroom) +: `DSP_B_MAXWIDTH-sign_headroom]}),
+					.B({{sign_headroom{1'b0}}, B[i*(`DSP_B_MAXWIDTH_PARTIAL-sign_headroom) +: `DSP_B_MAXWIDTH_PARTIAL-sign_headroom]}),
 					.Y(partial[i])
 				);
-				// TODO: Currently a 'cascade' approach to summing the partial 
+				// TODO: Currently a 'cascade' approach to summing the partial
 				//       products is taken here, but a more efficient 'binary
 				//       reduction' approach also exists...
-				assign partial_sum[i] = (partial[i] << i*(`DSP_B_MAXWIDTH-sign_headroom)) + partial_sum[i-1];
+				if (i == 0)
+					assign partial_sum[i] = partial[i];
+				else
+					assign partial_sum[i] = (partial[i] << i*(`DSP_B_MAXWIDTH_PARTIAL-sign_headroom)) + partial_sum[i-1];
 			end
 
-			localparam last_B_WIDTH = B_WIDTH-(n-1)*(`DSP_B_MAXWIDTH-sign_headroom);
 			\$__mul #(
 				.A_SIGNED(A_SIGNED),
 				.B_SIGNED(B_SIGNED),
@@ -249,10 +214,10 @@ module \$__mul (A, B, Y);
 				.B(B[B_WIDTH-1 -: last_B_WIDTH]),
 				.Y(last_partial)
 			);
-			assign partial_sum[n-1] = (last_partial << (n-1)*(`DSP_B_MAXWIDTH-sign_headroom)) + partial_sum[n-2];
-			assign Y = partial_sum[n-1];
+			assign partial_sum[n] = (last_partial << n*(`DSP_B_MAXWIDTH_PARTIAL-sign_headroom)) + partial_sum[n-1];
+			assign Y = partial_sum[n];
 		end
-		else begin 
+		else begin
 			if (A_SIGNED)
 				wire signed [`DSP_A_MAXWIDTH-1:0] Aext = $signed(A);
 			else
@@ -274,11 +239,12 @@ module \$__mul (A, B, Y);
 				.Y(Y)
 			);
 		end
+	end
 	endgenerate
 endmodule
 
-(* techmap_celltype = "$__mul" *)
-module $__soft_mul (A, B, Y); 
+(* techmap_celltype = "$mul $__mul" *)
+module _90_soft_mul (A, B, Y);
 	parameter A_SIGNED = 0;
 	parameter B_SIGNED = 0;
 	parameter A_WIDTH = 1;
@@ -292,41 +258,41 @@ module $__soft_mul (A, B, Y);
 	// Indirection necessary since mapping
 	//   back to $mul will cause recursion
 	generate
-		if (A_SIGNED && !B_SIGNED)
-			\$__soft__mul #(
-				.A_SIGNED(A_SIGNED),
-				.B_SIGNED(1),
-				.A_WIDTH(A_WIDTH),
-				.B_WIDTH(B_WIDTH+1),
-				.Y_WIDTH(Y_WIDTH)
-			) _TECHMAP_REPLACE_ (
-				.A(A),
-				.B({1'b0,B}),
-				.Y(Y)
-			);
-		else if (!A_SIGNED && B_SIGNED)
-			\$__soft_mul #(
-				.A_SIGNED(1),
-				.B_SIGNED(B_SIGNED),
-				.A_WIDTH(A_WIDTH+1),
-				.B_WIDTH(B_WIDTH),
-				.Y_WIDTH(Y_WIDTH)
-			) _TECHMAP_REPLACE_ (
-				.A({1'b0,A}),
-				.B(B),
-				.Y(Y)
-			);
-		else
-			\$__soft_mul #(
-				.A_SIGNED(A_SIGNED),
-				.B_SIGNED(B_SIGNED),
-				.A_WIDTH(A_WIDTH),
-				.B_WIDTH(B_WIDTH),
-				.Y_WIDTH(Y_WIDTH)
-			) _TECHMAP_REPLACE_ (
-				.A(A),
-				.B(B),
-				.Y(Y)
-			);
+	if (A_SIGNED && !B_SIGNED)
+		\$__soft_mul #(
+			.A_SIGNED(A_SIGNED),
+			.B_SIGNED(1),
+			.A_WIDTH(A_WIDTH),
+			.B_WIDTH(B_WIDTH+1),
+			.Y_WIDTH(Y_WIDTH)
+		) _TECHMAP_REPLACE_ (
+			.A(A),
+			.B({1'b0,B}),
+			.Y(Y)
+		);
+	else if (!A_SIGNED && B_SIGNED)
+		\$__soft_mul #(
+			.A_SIGNED(1),
+			.B_SIGNED(B_SIGNED),
+			.A_WIDTH(A_WIDTH+1),
+			.B_WIDTH(B_WIDTH),
+			.Y_WIDTH(Y_WIDTH)
+		) _TECHMAP_REPLACE_ (
+			.A({1'b0,A}),
+			.B(B),
+			.Y(Y)
+		);
+	else
+		\$__soft_mul #(
+			.A_SIGNED(A_SIGNED),
+			.B_SIGNED(B_SIGNED),
+			.A_WIDTH(A_WIDTH),
+			.B_WIDTH(B_WIDTH),
+			.Y_WIDTH(Y_WIDTH)
+		) _TECHMAP_REPLACE_ (
+			.A(A),
+			.B(B),
+			.Y(Y)
+		);
 	endgenerate
 endmodule
diff --git a/techlibs/xilinx/cells_sim.v b/techlibs/xilinx/cells_sim.v
index 2731cb454..02ce0d61b 100644
--- a/techlibs/xilinx/cells_sim.v
+++ b/techlibs/xilinx/cells_sim.v
@@ -784,4 +784,6 @@ module DSP48E1 (
         end
     endgenerate
 
+    assign PCOUT = P;
+
 endmodule
diff --git a/techlibs/xilinx/synth_xilinx.cc b/techlibs/xilinx/synth_xilinx.cc
index 546d67337..102c896aa 100644
--- a/techlibs/xilinx/synth_xilinx.cc
+++ b/techlibs/xilinx/synth_xilinx.cc
@@ -288,7 +288,7 @@ struct SynthXilinxPass : public ScriptPass
 		if (check_label("dsp")) {
 			if (!nodsp || help_mode) {
 				// NB: Xilinx multipliers are signed only
-				run("techmap -map +/mul2dsp.v -map +/xilinx/dsp_map.v -D DSP_A_MAXWIDTH=25 -D DSP_B_MAXWIDTH=18 -D DSP_SIGNEDONLY=1 -D DSP_NAME=$__MUL25X18", "(skip if '-nodsp')");
+				run("techmap -map +/mul2dsp.v -map +/xilinx/dsp_map.v -D DSP_A_MAXWIDTH=25 -D DSP_A_MAXWIDTH_PARTIAL=18 -D DSP_B_MAXWIDTH=18 -D DSP_SIGNEDONLY=1 -D DSP_NAME=$__MUL25X18", "(skip if '-nodsp')");
 				run("opt_expr -fine", "                 (skip if '-nodsp')");
 				run("wreduce", "                        (skip if '-nodsp')");
 				run("xilinx_dsp", "                     (skip if '-nodsp')");