10 files changed, 551 insertions, 171 deletions
diff --git a/passes/cmds/check.cc b/passes/cmds/check.cc
index 64697c134..820ecac7b 100644
--- a/passes/cmds/check.cc
+++ b/passes/cmds/check.cc
@@ -41,14 +41,24 @@ struct CheckPass : public Pass {
 		log("\n");
 		log(" - used wires that do not have a driver\n");
 		log("\n");
-		log("When called with -noinit then this command also checks for wires which have\n");
-		log("the 'init' attribute set.\n");
+		log("Options:\n");
 		log("\n");
-		log("When called with -initdrv then this command also checks for wires which have\n");
-		log("the 'init' attribute set and aren't driven by a FF cell type.\n");
+		log("  -noinit\n");
+		log("    Also check for wires which have the 'init' attribute set.\n");
 		log("\n");
-		log("When called with -assert then the command will produce an error if any\n");
-		log("problems are found in the current design.\n");
+		log("  -initdrv\n");
+		log("    Also check for wires that have the 'init' attribute set and are not\n");
+		log("    driven by an FF cell type.\n");
+		log("\n");
+		log("  -mapped\n");
+		log("    Also check for internal cells that have not been mapped to cells of the\n");
+		log("    target architecture.\n");
+		log("\n");
+		log("  -allow-tbuf\n");
+		log("    Modify the -mapped behavior to still allow $_TBUF_ cells.\n");
+		log("\n");
+		log("  -assert\n");
+		log("    Produce a runtime error if any problems are found in the current design.\n");
 		log("\n");
 	}
 	void execute(std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
@@ -56,6 +66,8 @@ struct CheckPass : public Pass {
 		int counter = 0;
 		bool noinit = false;
 		bool initdrv = false;
+		bool mapped = false;
+		bool allow_tbuf = false;
 		bool assert_mode = false;
 
 		size_t argidx;
@@ -68,6 +80,14 @@ struct CheckPass : public Pass {
 				initdrv = true;
 				continue;
 			}
+			if (args[argidx] == "-mapped") {
+				mapped = true;
+				continue;
+			}
+			if (args[argidx] == "-allow-tbuf") {
+				allow_tbuf = true;
+				continue;
+			}
 			if (args[argidx] == "-assert") {
 				assert_mode = true;
 				continue;
@@ -135,29 +155,37 @@ struct CheckPass : public Pass {
 			TopoSort<string> topo;
 
 			for (auto cell : module->cells())
-			for (auto &conn : cell->connections()) {
-				SigSpec sig = sigmap(conn.second);
-				bool logic_cell = yosys_celltypes.cell_evaluable(cell->type);
-				if (cell->input(conn.first))
-					for (auto bit : sig)
-						if (bit.wire) {
+			{
+				if (mapped && cell->type.begins_with("$") && design->module(cell->type) == nullptr) {
+					if (allow_tbuf && cell->type == ID($_TBUF_)) goto cell_allowed;
+					log_warning("Cell %s.%s is an unmapped internal cell of type %s.\n", log_id(module), log_id(cell), log_id(cell->type));
+					counter++;
+				cell_allowed:;
+				}
+				for (auto &conn : cell->connections()) {
+					SigSpec sig = sigmap(conn.second);
+					bool logic_cell = yosys_celltypes.cell_evaluable(cell->type);
+					if (cell->input(conn.first))
+						for (auto bit : sig)
+							if (bit.wire) {
+								if (logic_cell)
+									topo.edge(stringf("wire %s", log_signal(bit)),
+											stringf("cell %s (%s)", log_id(cell), log_id(cell->type)));
+								used_wires.insert(bit);
+							}
+					if (cell->output(conn.first))
+						for (int i = 0; i < GetSize(sig); i++) {
 							if (logic_cell)
-								topo.edge(stringf("wire %s", log_signal(bit)),
-										stringf("cell %s (%s)", log_id(cell), log_id(cell->type)));
-							used_wires.insert(bit);
+								topo.edge(stringf("cell %s (%s)", log_id(cell), log_id(cell->type)),
+										stringf("wire %s", log_signal(sig[i])));
+							if (sig[i].wire)
+								wire_drivers[sig[i]].push_back(stringf("port %s[%d] of cell %s (%s)",
+										log_id(conn.first), i, log_id(cell), log_id(cell->type)));
 						}
-				if (cell->output(conn.first))
-					for (int i = 0; i < GetSize(sig); i++) {
-						if (logic_cell)
-							topo.edge(stringf("cell %s (%s)", log_id(cell), log_id(cell->type)),
-									stringf("wire %s", log_signal(sig[i])));
-						if (sig[i].wire)
-							wire_drivers[sig[i]].push_back(stringf("port %s[%d] of cell %s (%s)",
-									log_id(conn.first), i, log_id(cell), log_id(cell->type)));
-					}
-				if (!cell->input(conn.first) && cell->output(conn.first))
-					for (auto bit : sig)
-						if (bit.wire) wire_drivers_count[bit]++;
+					if (!cell->input(conn.first) && cell->output(conn.first))
+						for (auto bit : sig)
+							if (bit.wire) wire_drivers_count[bit]++;
+				}
 			}
 
 			pool<SigBit> init_bits;
diff --git a/passes/pmgen/README.md b/passes/pmgen/README.md
index 2f5b8d0b2..39560839f 100644
--- a/passes/pmgen/README.md
+++ b/passes/pmgen/README.md
@@ -190,7 +190,7 @@ create matches for different sections of a cell. For example:
         select pmux->type == $pmux
         slice idx GetSize(port(pmux, \S))
         index <SigBit> port(pmux, \S)[idx] === port(eq, \Y)
-	set pmux_slice idx
+        set pmux_slice idx
     endmatch
 
 The first argument to `slice` is the local variable name used to identify the
diff --git a/passes/pmgen/ice40_wrapcarry.pmg b/passes/pmgen/ice40_wrapcarry.pmg
index 9e64c7467..bb59edb0c 100644
--- a/passes/pmgen/ice40_wrapcarry.pmg
+++ b/passes/pmgen/ice40_wrapcarry.pmg
@@ -9,3 +9,7 @@ match lut
 	index <SigSpec> port(lut, \I1) === port(carry, \I0)
 	index <SigSpec> port(lut, \I2) === port(carry, \I1)
 endmatch
+
+code
+	accept;
+endcode
diff --git a/passes/pmgen/xilinx_dsp.cc b/passes/pmgen/xilinx_dsp.cc
index 11c7e5ea8..054e123e4 100644
--- a/passes/pmgen/xilinx_dsp.cc
+++ b/passes/pmgen/xilinx_dsp.cc
@@ -20,6 +20,7 @@
 
 #include "kernel/yosys.h"
 #include "kernel/sigtools.h"
+#include <deque>
 
 USING_YOSYS_NAMESPACE
 PRIVATE_NAMESPACE_BEGIN
@@ -608,8 +609,13 @@ struct XilinxDspPass : public Pass {
 		extra_args(args, argidx, design);
 
 		for (auto module : design->selected_modules()) {
+			// Experimental feature: pack $add/$sub cells with
+			//   (* use_dsp48="simd" *) into DSP48E1's using its
+			//   SIMD feature
 			xilinx_simd_pack(module, module->selected_cells());
 
+			// Match for all features ([ABDMP][12]?REG, pre-adder,
+			// post-adder, pattern detector, etc.) except for CREG
 			{
 				xilinx_dsp_pm pm(module, module->selected_cells());
 				pm.run_xilinx_dsp_pack(xilinx_dsp_pack);
@@ -618,14 +624,17 @@ struct XilinxDspPass : public Pass {
 			//   is no guarantee that the cell ordering corresponds
 			//   to the "expected" case (i.e. the order in which
 			//   they appear in the source) thus the possiblity
-			//   existed that a register got packed as CREG into a
+			//   existed that a register got packed as a CREG into a
 			//   downstream DSP that should have otherwise been a
-			//   PREG of an upstream DSP that had not been pattern
-			//   matched yet
+			//   PREG of an upstream DSP that had not been visited
+			//   yet
 			{
 				xilinx_dsp_CREG_pm pm(module, module->selected_cells());
 				pm.run_xilinx_dsp_packC(xilinx_dsp_packC);
 			}
+			// Lastly, identify and utilise PCOUT -> PCIN,
+			//   ACOUT -> ACIN, and BCOUT -> BCIN dedicated cascade
+			//   chains
 			{
 				xilinx_dsp_cascade_pm pm(module, module->selected_cells());
 				pm.run_xilinx_dsp_cascade();
diff --git a/passes/pmgen/xilinx_dsp.pmg b/passes/pmgen/xilinx_dsp.pmg
index 3d0b1f2c3..604aa222b 100644
--- a/passes/pmgen/xilinx_dsp.pmg
+++ b/passes/pmgen/xilinx_dsp.pmg
@@ -1,3 +1,57 @@
+// This file describes the main pattern matcher setup (of three total) that
+//   forms the `xilinx_dsp` pass described in xilinx_dsp.cc
+// At a high level, it works as follows:
+//   ( 1) Starting from a DSP48E1 cell
+//   ( 2) Match the driver of the 'A' input to a possible $dff cell (ADREG)
+//        (attached to at most two $mux cells that implement clock-enable or
+//         reset functionality, using a subpattern discussed below)
+//        If ADREG matched, treat 'A' input as input of ADREG
+//   ( 3) Match the driver of the 'A' and 'D' inputs for a possible $add cell
+//       (pre-adder)
+//   ( 4) If pre-adder was present, find match 'A' input for A2REG
+//        If pre-adder was not present, move ADREG to A2REG
+//        If A2REG, then match 'A' input for A1REG
+//   ( 5) Match 'B' input for B2REG
+//        If B2REG, then match 'B' input for B1REG
+//   ( 6) Match 'D' input for DREG
+//   ( 7) Match 'P' output that exclusively drives an MREG
+//   ( 8) Match 'P' output that exclusively drives one of two inputs to an $add
+//        cell (post-adder).
+//        The other input to the adder is assumed to come in from the 'C' input
+//        (note: 'P' -> 'C' connections that exist for accumulators are
+//         recognised in xilinx_dsp.cc).
+//   ( 9) Match 'P' output that exclusively drives a PREG
+//   (10) If post-adder and PREG both present, match for a $mux cell driving
+//        the 'C' input, where one of the $mux's inputs is the PREG output.
+//        This indicates an accumulator situation, and one where a $mux exists
+//        to override the accumulated value:
+//             +--------------------------------+
+//             |   ____                         |
+//             +--|    \                        |
+//                |$mux|-+                      |
+//         'C' ---|____/ |                      |
+//                       | /-------\   +----+   |
+//            +----+     +-| post- |___|PREG|---+ 'P'
+//            |MREG|------ | adder |   +----+
+//            +----+       \-------/
+//   (11) If PREG present, match for a greater-than-or-equal $ge cell attached
+//        to the 'P' output where it is compared to a constant that is a
+//        power-of-2: e.g. `assign overflow = (PREG >= 2**40);`
+//        In this scenario, the pattern detector functionality of a DSP48E1 can
+//        be used to implement this function
+// Notes:
+//   - The intention of this pattern matcher is for it to be compatible with
+//     DSP48E1 cells inferred from multiply operations by Yosys, as well as for
+//     user instantiations that may already contain the cells being packed...
+//     (though the latter is currently untested)
+//   - Since the $dff-with-optional-clock-enable-or-reset-mux pattern is used
+//     for each *REG match, it has been factored out into two subpatterns:
+//     in_dffe and out_dffe located at the bottom of this file.
+//   - Matching for pattern detector features is currently incomplete. For
+//     example, matching for underflow as well as overflow detection is
+//     possible, as would auto-reset, enabling saturated arithmetic, detecting
+//     custom patterns, etc.
+
 pattern xilinx_dsp_pack
 
 state <SigBit> clock
@@ -5,12 +59,11 @@ state <SigSpec> sigA sigB sigC sigD sigM sigP
 state <IdString> postAddAB postAddMuxAB
 state <bool> ffA1cepol ffA2cepol ffADcepol ffB1cepol ffB2cepol ffDcepol ffMcepol ffPcepol
 state <bool> ffArstpol ffADrstpol ffBrstpol ffDrstpol ffMrstpol ffPrstpol
-
 state <Cell*> ffAD ffADcemux ffADrstmux ffA1 ffA1cemux ffA1rstmux ffA2 ffA2cemux ffA2rstmux
 state <Cell*> ffB1 ffB1cemux ffB1rstmux ffB2 ffB2cemux ffB2rstmux
 state <Cell*> ffD ffDcemux ffDrstmux ffM ffMcemux ffMrstmux ffP ffPcemux ffPrstmux
 
-// subpattern
+// Variables used for subpatterns
 state <SigSpec> argQ argD
 state <bool> ffcepol ffrstpol
 state <int> ffoffset
@@ -19,6 +72,7 @@ udata <SigBit> dffclock
 udata <Cell*> dff dffcemux dffrstmux
 udata <bool> dffcepol dffrstpol
 
+// (1) Starting from a DSP48E1 cell
 match dsp
 	select dsp->type.in(\DSP48E1)
 endmatch
@@ -50,17 +104,21 @@ code sigA sigB sigC sigD sigM clock
 			sigM.append(P[i]);
 		}
 		log_assert(nusers(P.extract_end(i)) <= 1);
+		// This sigM could have no users if downstream sinks (e.g. $add) are
+		//   narrower than $mul result, for example
+		if (sigM.empty())
+			reject;
 	}
 	else
 		sigM = P;
-	// This sigM could have no users if downstream $add
-	//   is narrower than $mul result, for example
-	if (sigM.empty())
-		reject;
 
 	clock = port(dsp, \CLK, SigBit());
 endcode
 
+// (2) Match the driver of the 'A' input to a possible $dff cell (ADREG)
+//     (attached to at most two $mux cells that implement clock-enable or
+//      reset functionality, using a subpattern discussed above)
+//     If matched, treat 'A' input as input of ADREG
 code argQ ffAD ffADcemux ffADrstmux ffADcepol ffADrstpol sigA clock
 	if (param(dsp, \ADREG).as_int() == 0) {
 		argQ = sigA;
@@ -81,6 +139,8 @@ code argQ ffAD ffADcemux ffADrstmux ffADcepol ffADrstpol sigA clock
 	}
 endcode
 
+// (3) Match the driver of the 'A' and 'D' inputs for a possible $add cell
+//     (pre-adder)
 match preAdd
 	if sigD.empty() || sigD.is_fully_zero()
 	// Ensure that preAdder not already used
@@ -106,11 +166,12 @@ code sigA sigD
 	if (preAdd) {
 		sigA = port(preAdd, \A);
 		sigD = port(preAdd, \B);
-		if (GetSize(sigA) < GetSize(sigD))
-			std::swap(sigA, sigD);
 	}
 endcode
 
+// (4) If pre-adder was present, find match 'A' input for A2REG
+//     If pre-adder was not present, move ADREG to A2REG
+//     Then match 'A' input for A1REG
 code argQ ffAD ffADcemux ffADrstmux ffADcepol ffADrstpol sigA clock ffA2 ffA2cemux ffA2rstmux ffA2cepol ffArstpol ffA1 ffA1cemux ffA1rstmux ffA1cepol
 	// Only search for ffA2 if there was a pre-adder
 	//   (otherwise ffA2 would have been matched as ffAD)
@@ -173,6 +234,8 @@ ffA1_end:		;
 	}
 endcode
 
+// (5) Match 'B' input for B2REG
+//     If B2REG, then match 'B' input for B1REG
 code argQ ffB2 ffB2cemux ffB2rstmux ffB2cepol ffBrstpol sigB clock ffB1 ffB1cemux ffB1rstmux ffB1cepol
 	if (param(dsp, \BREG).as_int() == 0) {
 		argQ = sigB;
@@ -222,6 +285,7 @@ ffB1_end:				;
 	}
 endcode
 
+// (6) Match 'D' input for DREG
 code argQ ffD ffDcemux ffDrstmux ffDcepol ffDrstpol sigD clock
 	if (param(dsp, \DREG).as_int() == 0) {
 		argQ = sigD;
@@ -242,6 +306,7 @@ code argQ ffD ffDcemux ffDrstmux ffDcepol ffDrstpol sigD clock
 	}
 endcode
 
+// (7) Match 'P' output that exclusively drives an MREG
 code argD ffM ffMcemux ffMrstmux ffMcepol ffMrstpol sigM sigP clock
 	if (param(dsp, \MREG).as_int() == 0 && nusers(sigM) == 2) {
 		argD = sigM;
@@ -263,6 +328,11 @@ code argD ffM ffMcemux ffMrstmux ffMcepol ffMrstpol sigM sigP clock
 	sigP = sigM;
 endcode
 
+// (8) Match 'P' output that exclusively drives one of two inputs to an $add
+//     cell (post-adder).
+//     The other input to the adder is assumed to come in from the 'C' input
+//     (note: 'P' -> 'C' connections that exist for accumulators are
+//      recognised in xilinx_dsp.cc).
 match postAdd
 	// Ensure that Z mux is not already used
 	if port(dsp, \OPMODE, SigSpec()).extract(4,3).is_fully_zero()
@@ -277,7 +347,9 @@ match postAdd
 	index <SigBit> port(postAdd, AB)[0] === sigP[0]
 	filter GetSize(port(postAdd, AB)) >= GetSize(sigP)
 	filter port(postAdd, AB).extract(0, GetSize(sigP)) == sigP
-	filter port(postAdd, AB).extract_end(GetSize(sigP)) == SigSpec(sigP[GetSize(sigP)-1], GetSize(port(postAdd, AB))-GetSize(sigP))
+	// Check that remainder of AB is a sign-extension
+	define <bool> AB_SIGNED (param(postAdd, AB == \A ? \A_SIGNED : \B_SIGNED).as_bool())
+	filter port(postAdd, AB).extract_end(GetSize(sigP)) == SigSpec(AB_SIGNED ? sigP[GetSize(sigP)-1] : State::S0, GetSize(port(postAdd, AB))-GetSize(sigP))
 	set postAddAB AB
 	optional
 endmatch
@@ -289,6 +361,7 @@ code sigC sigP
 	}
 endcode
 
+// (9) Match 'P' output that exclusively drives a PREG
 code argD ffP ffPcemux ffPrstmux ffPcepol ffPrstpol sigP clock
 	if (param(dsp, \PREG).as_int() == 0) {
 		int users = 2;
@@ -314,6 +387,19 @@ code argD ffP ffPcemux ffPrstmux ffPcepol ffPrstpol sigP clock
 	}
 endcode
 
+// (10) If post-adder and PREG both present, match for a $mux cell driving
+//      the 'C' input, where one of the $mux's inputs is the PREG output.
+//      This indicates an accumulator situation, and one where a $mux exists
+//      to override the accumulated value:
+//           +--------------------------------+
+//           |   ____                         |
+//           +--|    \                        |
+//              |$mux|-+                      |
+//       'C' ---|____/ |                      |
+//                     | /-------\   +----+   |
+//          +----+     +-| post- |___|PREG|---+ 'P'
+//          |MREG|------ | adder |   +----+
+//          +----+       \-------/
 match postAddMux
 	if postAdd
 	if ffP
@@ -331,6 +417,11 @@ code sigC
 		sigC = port(postAddMux, postAddMuxAB == \A ? \B : \A);
 endcode
 
+// (11) If PREG present, match for a greater-than-or-equal $ge cell attached to
+//      the 'P' output where it is compared to a constant that is a power-of-2:
+//      e.g. `assign overflow = (PREG >= 2**40);`
+//      In this scenario, the pattern detector functionality of a DSP48E1 can
+//      be used to implement this function
 match overflow
 	if ffP
 	if param(dsp, \USE_PATTERN_DETECT, Const("NO_PATDET")).decode_string() == "NO_PATDET"
@@ -349,22 +440,45 @@ endcode
 
 // #######################
 
+// Subpattern for matching against input registers, based on knowledge of the
+//   'Q' input. Typically, identifying registers with clock-enable and reset
+//   capability would be a task handled by other Yosys passes such as
+//   dff2dffe, but since DSP inference happens much before this, these patterns
+//   have to be manually identified.
+// At a high level:
+//   (1) Starting from a $dff cell that (partially or fully) drives the given
+//       'Q' argument
+//   (2) Match for a $mux cell implementing synchronous reset semantics ---
+//       one that exclusively drives the 'D' input of the $dff, with one of its
+//       $mux inputs being fully zero
+//   (3) Match for a $mux cell implementing clock enable semantics --- one that
+//       exclusively drives the 'D' input of the $dff (or the other input of
+//       the reset $mux) and where one of this $mux's inputs is connected to
+//       the 'Q' output of the $dff
 subpattern in_dffe
 arg argD argQ clock
 
 code
 	dff = nullptr;
-	for (auto c : argQ.chunks()) {
+	for (const auto &c : argQ.chunks()) {
+		// Abandon matches when 'Q' is a constant
 		if (!c.wire)
 			reject;
+		// Abandon matches when 'Q' has the keep attribute set
 		if (c.wire->get_bool_attribute(\keep))
 			reject;
-		Const init = c.wire->attributes.at(\init, State::Sx);
-		if (!init.is_fully_undef() && !init.is_fully_zero())
-			reject;
+		// Abandon matches when 'Q' has a non-zero init attribute set
+		// (not supported by DSP48E1)
+		Const init = c.wire->attributes.at(\init, Const());
+		if (!init.empty())
+			for (auto b : init.extract(c.offset, c.width))
+				if (b != State::Sx && b != State::S0)
+					reject;
 	}
 endcode
 
+// (1) Starting from a $dff cell that (partially or fully) drives the given
+//     'Q' argument
 match ff
 	select ff->type.in($dff)
 	// DSP48E1 does not support clock inversion
@@ -377,14 +491,12 @@ match ff
 	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
 	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ
 
+	filter clock == SigBit() || port(ff, \CLK) == clock
+
 	set ffoffset offset
 endmatch
 
 code argQ argD
-{
-	if (clock != SigBit() && port(ff, \CLK) != clock)
-		reject;
-
 	SigSpec Q = port(ff, \Q);
 	dff = ff;
 	dffclock = port(ff, \CLK);
@@ -396,9 +508,11 @@ code argQ argD
 	//   has two (ff, ffrstmux) users
 	if (nusers(dffD) > 2)
 		argD = SigSpec();
-}
 endcode
 
+// (2) Match for a $mux cell implementing synchronous reset semantics ---
+//     exclusively drives the 'D' input of the $dff, with one of the $mux
+//     inputs being fully zero
 match ffrstmux
 	if !argD.empty()
 	select ffrstmux->type.in($mux)
@@ -430,6 +544,10 @@ code argD
 		dffrstmux = nullptr;
 endcode
 
+// (3) Match for a $mux cell implementing clock enable semantics --- one that
+//     exclusively drives the 'D' input of the $dff (or the other input of
+//     the reset $mux) and where one of this $mux's inputs is connected to
+//     the 'Q' output of the $dff
 match ffcemux
 	if !argD.empty()
 	select ffcemux->type.in($mux)
@@ -454,16 +572,32 @@ endcode
 
 // #######################
 
+// Subpattern for matching against output registers, based on knowledge of the
+//   'D' input.
+// At a high level:
+//   (1) Starting from an optional $mux cell that implements clock enable
+//       semantics --- one where the given 'D' argument (partially or fully)
+//       drives one of its two inputs
+//   (2) Starting from, or continuing onto, another optional $mux cell that
+//       implements synchronous reset semantics --- one where the given 'D'
+//       argument (or the clock enable $mux output) drives one of its two inputs
+//       and where the other input is fully zero
+//   (3) Match for a $dff cell (whose 'D' input is the 'D' argument, or the
+//       output of the previous clock enable or reset $mux cells)
 subpattern out_dffe
 arg argD argQ clock
 
 code
 	dff = nullptr;
 	for (auto c : argD.chunks())
+		// Abandon matches when 'D' has the keep attribute set
 		if (c.wire->get_bool_attribute(\keep))
 			reject;
 endcode
 
+// (1) Starting from an optional $mux cell that implements clock enable
+//     semantics --- one where the given 'D' argument (partially or fully)
+//     drives one of its two inputs
 match ffcemux
 	select ffcemux->type.in($mux)
 	// ffcemux output must have two users: ffcemux and ff.D
@@ -502,6 +636,10 @@ code argD argQ
 	}
 endcode
 
+// (2) Starting from, or continuing onto, another optional $mux cell that
+//     implements synchronous reset semantics --- one where the given 'D'
+//     argument (or the clock enable $mux output) drives one of its two inputs
+//     and where the other input is fully zero
 match ffrstmux
 	select ffrstmux->type.in($mux)
 	// ffrstmux output must have two users: ffrstmux and ff.D
@@ -540,6 +678,8 @@ code argD argQ
 	}
 endcode
 
+// (3) Match for a $dff cell (whose 'D' input is the 'D' argument, or the
+//     output of the previous clock enable or reset $mux cells)
 match ff
 	select ff->type.in($dff)
 	// DSP48E1 does not support clock inversion
@@ -556,32 +696,30 @@ match ff
 	// Check that FF.Q is connected to CE-mux
 	filter !ffcemux || port(ff, \Q).extract(offset, GetSize(argQ)) == argQ
 
+	filter clock == SigBit() || port(ff, \CLK) == clock
+
 	set ffoffset offset
 endmatch
 
 code argQ
-	if (ff) {
-		if (clock != SigBit() && port(ff, \CLK) != clock)
-			reject;
-
-		SigSpec D = port(ff, \D);
-		SigSpec Q = port(ff, \Q);
-		if (!ffcemux) {
-			argQ = argD;
-			argQ.replace(D, Q);
-		}
-
-		for (auto c : argQ.chunks()) {
-			Const init = c.wire->attributes.at(\init, State::Sx);
-			if (!init.is_fully_undef() && !init.is_fully_zero())
-				reject;
-		}
+	SigSpec D = port(ff, \D);
+	SigSpec Q = port(ff, \Q);
+	if (!ffcemux) {
+		argQ = argD;
+		argQ.replace(D, Q);
+	}
 
-		dff = ff;
-		dffQ = argQ;
-		dffclock = port(ff, \CLK);
+	// Abandon matches when 'Q' has a non-zero init attribute set
+	// (not supported by DSP48E1)
+	for (auto c : argQ.chunks()) {
+		Const init = c.wire->attributes.at(\init, Const());
+		if (!init.empty())
+			for (auto b : init.extract(c.offset, c.width))
+				if (b != State::Sx && b != State::S0)
+					reject;
 	}
-	// No enable/reset mux possible without flop
-	else if (dffcemux || dffrstmux)
-		reject;
+
+	dff = ff;
+	dffQ = argQ;
+	dffclock = port(ff, \CLK);
 endcode
diff --git a/passes/pmgen/xilinx_dsp_CREG.pmg b/passes/pmgen/xilinx_dsp_CREG.pmg
index a31dc80bf..a57043009 100644
--- a/passes/pmgen/xilinx_dsp_CREG.pmg
+++ b/passes/pmgen/xilinx_dsp_CREG.pmg
@@ -1,3 +1,26 @@
+// This file describes the second of three pattern matcher setups that
+//   form the `xilinx_dsp` pass described in xilinx_dsp.cc
+// At a high level, it works as follows:
+//   (1) Starting from a DSP48E1 cell that (a) doesn't have a CREG already,
+//       and (b) uses the 'C' port
+//   (2) Match the driver of the 'C' input to a possible $dff cell (CREG)
+//       (attached to at most two $mux cells that implement clock-enable or
+//        reset functionality, using a subpattern discussed below)
+// Notes:
+//   - Running CREG packing after xilinx_dsp_pack is necessary since there is no
+//     guarantee that the cell ordering corresponds to the "expected" case (i.e.
+//     the order in which they appear in the source) thus the possibility existed
+//     that a register got packed as a CREG into a downstream DSP that should
+//     have otherwise been a PREG of an upstream DSP that had not been visited
+//     yet
+//   - The reason this is separated out from the xilinx_dsp.pmg file is
+//     for efficiency --- each *.pmg file creates a class of the same basename,
+//     which when constructed, creates a custom database tailored to the
+//     pattern(s) contained within. Since the pattern in this file must be
+//     executed after the pattern contained in xilinx_dsp.pmg, it is necessary
+//     to reconstruct this database. Separating the two patterns into
+//     independent files results in two smaller, more specific databases.
+
 pattern xilinx_dsp_packC
 
 udata <std::function<SigSpec(const SigSpec&)>> unextend
@@ -6,7 +29,7 @@ state <SigSpec> sigC sigP
 state <bool> ffCcepol ffCrstpol
 state <Cell*> ffC ffCcemux ffCrstmux
 
-// subpattern
+// Variables used for subpatterns
 state <SigSpec> argQ argD
 state <bool> ffcepol ffrstpol
 state <int> ffoffset
@@ -15,13 +38,15 @@ udata <SigBit> dffclock
 udata <Cell*> dff dffcemux dffrstmux
 udata <bool> dffcepol dffrstpol
 
+// (1) Starting from a DSP48E1 cell that (a) doesn't have a CREG already,
+//     and (b) uses the 'C' port
 match dsp
 	select dsp->type.in(\DSP48E1)
 	select param(dsp, \CREG, 1).as_int() == 0
 	select nusers(port(dsp, \C, SigSpec())) > 1
 endmatch
 
-code argQ ffC ffCcemux ffCrstmux ffCcepol ffCrstpol sigC sigP clock
+code sigC sigP clock
 	unextend = [](const SigSpec &sig) {
 		int i;
 		for (i = GetSize(sig)-1; i > 0; i--)
@@ -48,11 +73,13 @@ code argQ ffC ffCcemux ffCrstmux ffCcepol ffCrstpol sigC sigP clock
 	else
 		sigP = P;
 
-	if (sigC == sigP)
-		reject;
-
 	clock = port(dsp, \CLK, SigBit());
+endcode
 
+// (2) Match the driver of the 'C' input to a possible $dff cell (CREG)
+//     (attached to at most two $mux cells that implement clock-enable or
+//      reset functionality, using the in_dffe subpattern)
+code argQ ffC ffCcemux ffCrstmux ffCcepol ffCrstpol sigC clock
 	argQ = sigC;
 	subpattern(in_dffe);
 	if (dff) {
@@ -77,22 +104,44 @@ endcode
 
 // #######################
 
+// Subpattern for matching against input registers, based on knowledge of the
+//   'Q' input. Typically, identifying registers with clock-enable and reset
+//   capability would be a task handled by other Yosys passes such as
+//   dff2dffe, but since DSP inference happens much before this, these patterns
+//   have to be manually identified.
+// At a high level:
+//   (1) Starting from a $dff cell that (partially or fully) drives the given
+//       'Q' argument
+//   (2) Match for a $mux cell implementing synchronous reset semantics ---
+//       one that exclusively drives the 'D' input of the $dff, with one of its
+//       $mux inputs being fully zero
+//   (3) Match for a $mux cell implementing clock enable semantics --- one that
+//       exclusively drives the 'D' input of the $dff (or the other input of
+//       the reset $mux) and where one of this $mux's inputs is connected to
+//       the 'Q' output of the $dff
 subpattern in_dffe
 arg argD argQ clock
 
 code
 	dff = nullptr;
-	for (auto c : argQ.chunks()) {
+	for (const auto &c : argQ.chunks()) {
+		// Abandon matches when 'Q' is a constant
 		if (!c.wire)
 			reject;
+		// Abandon matches when 'Q' has the keep attribute set
 		if (c.wire->get_bool_attribute(\keep))
 			reject;
-		Const init = c.wire->attributes.at(\init, State::Sx);
-		if (!init.is_fully_undef() && !init.is_fully_zero())
-			reject;
+		// Abandon matches when 'Q' has a non-zero init attribute set
+		// (not supported by DSP48E1)
+		Const init = c.wire->attributes.at(\init, Const());
+		for (auto b : init.extract(c.offset, c.width))
+			if (b != State::Sx && b != State::S0)
+				reject;
 	}
 endcode
 
+// (1) Starting from a $dff cell that (partially or fully) drives the given
+//     'Q' argument
 match ff
 	select ff->type.in($dff)
 	// DSP48E1 does not support clock inversion
@@ -105,14 +154,12 @@ match ff
 	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
 	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ
 
+	filter clock == SigBit() || port(ff, \CLK) == clock
+
 	set ffoffset offset
 endmatch
 
 code argQ argD
-{
-	if (clock != SigBit() && port(ff, \CLK) != clock)
-		reject;
-
 	SigSpec Q = port(ff, \Q);
 	dff = ff;
 	dffclock = port(ff, \CLK);
@@ -124,9 +171,11 @@ code argQ argD
 	//   has two (ff, ffrstmux) users
 	if (nusers(dffD) > 2)
 		argD = SigSpec();
-}
 endcode
 
+// (2) Match for a $mux cell implementing synchronous reset semantics ---
+//     exclusively drives the 'D' input of the $dff, with one of the $mux
+//     inputs being fully zero
 match ffrstmux
 	if !argD.empty()
 	select ffrstmux->type.in($mux)
@@ -158,6 +207,10 @@ code argD
 		dffrstmux = nullptr;
 endcode
 
+// (3) Match for a $mux cell implementing clock enable semantics --- one that
+//     exclusively drives the 'D' input of the $dff (or the other input of
+//     the reset $mux) and where one of this $mux's inputs is connected to
+//     the 'Q' output of the $dff
 match ffcemux
 	if !argD.empty()
 	select ffcemux->type.in($mux)
diff --git a/passes/pmgen/xilinx_dsp_cascade.pmg b/passes/pmgen/xilinx_dsp_cascade.pmg
index 6f4ac5849..7a32df2b7 100644
--- a/passes/pmgen/xilinx_dsp_cascade.pmg
+++ b/passes/pmgen/xilinx_dsp_cascade.pmg
@@ -1,3 +1,46 @@
+// This file describes the third of three pattern matcher setups that
+//   form the `xilinx_dsp` pass described in xilinx_dsp.cc
+// At a high level, it works as follows:
+//   (1) Starting from a DSP48E1 cell that (a) has the Z multiplexer
+//       (controlled by OPMODE[6:4]) set to zero and (b) doesn't already
+//       use the 'PCOUT' port
+//   (2.1) Match another DSP48E1 cell that (a) does not have the CREG enabled,
+//         (b) has its Z multiplexer output set to the 'C' port, which is
+//         driven by the 'P' output of the previous DSP cell, and (c) has its
+//         'PCIN' port unused
+//   (2.2) Same as (2.1) but with the 'C' port driven by the 'P' output of the
+//         previous DSP cell right-shifted by 17 bits
+//   (3) For this subsequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists)
+//       if (a) the previous DSP48E1 uses either the A2REG or A1REG, (b) this
+//       DSP48 does not use A2REG nor A1REG, (c) this DSP48E1 does not already
+//       have an ACOUT -> ACIN cascade, (d) the previous DSP does not already
+//       use its ACOUT port, then examine if an ACOUT -> ACIN cascade
+//       opportunity exists by matching for a $dff-with-optional-clock-enable-
+//       or-reset and checking that the 'D' input of this register is the same
+//       as the 'A' input of the previous DSP
+//   (4) Same as (3) but for BCOUT -> BCIN cascade
+//   (5) Recursively go to (2.1) until no more matches possible, keeping track
+//       of the longest possible chain found
+//   (6) The longest chain is then divided into chunks of no more than
+//       MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
+//       height of a DSP column) with each DSP in each chunk being rewritten
+//       to use [ABP]COUT -> [ABP]CIN cascading as appropriate
+// Notes:
+//   - Currently, [AB]COUT -> [AB]CIN cascades (3 or 4) are only considered
+//     if a PCOUT -> PCIN cascade (2.1 or 2.2) is first identified; this need
+//     not be the case --- [AB] cascades can exist independently of a P cascade
+//     (though all three cascades must come from the same DSP). This situation
+//     is not handled currently.
+//   - In addition, [AB]COUT -> [AB]CIN cascades (3 or 4) are currently
+//     conservative in that they examine the situation where (a) the previous
+//     DSP has [AB]2REG or [AB]1REG enabled, (b) that the downstream DSP has no
+//     registers enabled, and (c) that there exists only one additional register
+//     between the upstream and downstream DSPs. This can certainly be relaxed
+//     to identify situations ranging from (i) neither DSP uses any registers,
+//     to (ii) upstream DSP has 2 registers, downstream DSP has 2 registers, and
+//     there exists a further 2 registers between them. This remains a TODO
+//     item.
+
 pattern xilinx_dsp_cascade
 
 udata <std::function<SigSpec(const SigSpec&)>> unextend
@@ -6,7 +49,7 @@ state <Cell*> next
 state <SigSpec> clock
 state <int> AREG BREG
 
-// subpattern
+// Variables used for subpatterns
 state <SigSpec> argQ argD
 state <bool> ffcepol ffrstpol
 state <int> ffoffset
@@ -19,12 +62,19 @@ code
 #define MAX_DSP_CASCADE 20
 endcode
 
+// (1) Starting from a DSP48E1 cell that (a) has the Z multiplexer
+//     (controlled by OPMODE[6:4]) set to zero and (b) doesn't already
+//     use the 'PCOUT' port
 match first
 	select first->type.in(\DSP48E1)
 	select port(first, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("000")
 	select nusers(port(first, \PCOUT, SigSpec())) <= 1
 endmatch
 
+// (6) The longest chain is then divided into chunks of no more than
+//     MAX_DSP_CASCADE in length (to prevent long cascades that exceed the
+//     height of a DSP column) with each DSP in each chunk being rewritten
+//     to use [ABP]COUT -> [ABP]CIN cascading as appropriate
 code
 	longest_chain.clear();
 	chain.emplace_back(first, -1, -1, -1);
@@ -106,6 +156,10 @@ subpattern tail
 arg first
 arg next
 
+// (2.1) Match another DSP48E1 cell that (a) does not have the CREG enabled,
+//       (b) has its Z multiplexer output set to the 'C' port, which is
+//       driven by the 'P' output of the previous DSP cell, and (c) has its
+//       'PCIN' port unused
 match nextP
 	select nextP->type.in(\DSP48E1)
 	select !param(nextP, \CREG, State::S1).as_bool()
@@ -116,6 +170,8 @@ match nextP
 	semioptional
 endmatch
 
+// (2.2) Same as (2.1) but with the 'C' port driven by the 'P' output of the
+//       previous DSP cell right-shifted by 17 bits
 match nextP_shift17
 	if !nextP
 	select nextP_shift17->type.in(\DSP48E1)
@@ -145,6 +201,14 @@ code next
 	}
 endcode
 
+// (3) For this subsequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists)
+//     if (a) the previous DSP48E1 uses either the A2REG or A1REG, (b) this
+//     DSP48 does not use A2REG nor A1REG, (c) this DSP48E1 does not already
+//     have an ACOUT -> ACIN cascade, (d) the previous DSP does not already
+//     use its ACOUT port, then examine if an ACOUT -> ACIN cascade
+//     opportunity exists by matching for a $dff-with-optional-clock-enable-
+//     or-reset and checking that the 'D' input of this register is the same
+//     as the 'A' input of the previous DSP
 code argQ clock AREG
 	AREG = -1;
 	if (next) {
@@ -152,7 +216,6 @@ code argQ clock AREG
 		if (param(prev, \AREG, 2).as_int() > 0 &&
 				param(next, \AREG, 2).as_int() > 0 &&
 				param(next, \A_INPUT, Const("DIRECT")).decode_string() == "DIRECT" &&
-				port(next, \ACIN, SigSpec()).is_fully_zero() &&
 				nusers(port(prev, \ACOUT, SigSpec())) <= 1) {
 			argQ = unextend(port(next, \A));
 			clock = port(prev, \CLK);
@@ -174,6 +237,7 @@ reject_AREG:			;
 	}
 endcode
 
+// (4) Same as (3) but for BCOUT -> BCIN cascade
 code argQ clock BREG
 	BREG = -1;
 	if (next) {
@@ -203,13 +267,14 @@ reject_BREG:			;
 	}
 endcode
 
+// (5) Recursively go to (2.1) until no more matches possible, recording the
+//     longest possible chain
 code
 	if (next) {
 		chain.emplace_back(next, nextP_shift17 ? 17 : nextP ? 0 : -1, AREG, BREG);
 
 		SigSpec sigC = unextend(port(next, \C));
 
-		// TODO: Cannot use 'reject' since semioptional
 		if (nextP_shift17) {
 			if (GetSize(sigC)+17 <= GetSize(port(std::get<0>(chain.back()), \P)) &&
 					port(std::get<0>(chain.back()), \P).extract(17, GetSize(sigC)) != sigC)
@@ -232,22 +297,44 @@ endcode
 
 // #######################
 
+// Subpattern for matching against input registers, based on knowledge of the
+//   'Q' input. Typically, identifying registers with clock-enable and reset
+//   capability would be a task handled by other Yosys passes such as
+//   dff2dffe, but since DSP inference happens much before this, these patterns
+//   have to be manually identified.
+// At a high level:
+//   (1) Starting from a $dff cell that (partially or fully) drives the given
+//       'Q' argument
+//   (2) Match for a $mux cell implementing synchronous reset semantics ---
+//       one that exclusively drives the 'D' input of the $dff, with one of its
+//       $mux inputs being fully zero
+//   (3) Match for a $mux cell implementing clock enable semantics --- one that
+//       exclusively drives the 'D' input of the $dff (or the other input of
+//       the reset $mux) and where one of this $mux's inputs is connected to
+//       the 'Q' output of the $dff
 subpattern in_dffe
 arg argD argQ clock
 
 code
 	dff = nullptr;
-	for (auto c : argQ.chunks()) {
+	for (const auto &c : argQ.chunks()) {
+		// Abandon matches when 'Q' is a constant
 		if (!c.wire)
 			reject;
+		// Abandon matches when 'Q' has the keep attribute set
 		if (c.wire->get_bool_attribute(\keep))
 			reject;
-		Const init = c.wire->attributes.at(\init, State::Sx);
-		if (!init.is_fully_undef() && !init.is_fully_zero())
-			reject;
+		// Abandon matches when 'Q' has a non-zero init attribute set
+		// (not supported by DSP48E1)
+		Const init = c.wire->attributes.at(\init, Const());
+		for (auto b : init.extract(c.offset, c.width))
+			if (b != State::Sx && b != State::S0)
+				reject;
 	}
 endcode
 
+// (1) Starting from a $dff cell that (partially or fully) drives the given
+//     'Q' argument
 match ff
 	select ff->type.in($dff)
 	// DSP48E1 does not support clock inversion
@@ -260,14 +347,12 @@ match ff
 	filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ)
 	filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ
 
+	filter clock == SigBit() || port(ff, \CLK) == clock
+
 	set ffoffset offset
 endmatch
 
 code argQ argD
-{
-	if (clock != SigBit() && port(ff, \CLK) != clock)
-		reject;
-
 	SigSpec Q = port(ff, \Q);
 	dff = ff;
 	dffclock = port(ff, \CLK);
@@ -279,9 +364,11 @@ code argQ argD
 	//   has two (ff, ffrstmux) users
 	if (nusers(dffD) > 2)
 		argD = SigSpec();
-}
 endcode
 
+// (2) Match for a $mux cell implementing synchronous reset semantics ---
+//     one that exclusively drives the 'D' input of the $dff, with one of the
+//     $mux inputs being fully zero
 match ffrstmux
 	if !argD.empty()
 	select ffrstmux->type.in($mux)
@@ -313,6 +400,10 @@ code argD
 		dffrstmux = nullptr;
 endcode
 
+// (3) Match for a $mux cell implementing clock enable semantics --- one that
+//     exclusively drives the 'D' input of the $dff (or the other input of
+//     the reset $mux) and where one of this $mux's inputs is connected to
+//     the 'Q' output of the $dff
 match ffcemux
 	if !argD.empty()
 	select ffcemux->type.in($mux)
diff --git a/passes/techmap/abc9.cc b/passes/techmap/abc9.cc
index 09d6e9670..27106cc5d 100644
--- a/passes/techmap/abc9.cc
+++ b/passes/techmap/abc9.cc
@@ -71,21 +71,21 @@ RTLIL::Module *module;
 bool clk_polarity, en_polarity;
 RTLIL::SigSpec clk_sig, en_sig;
 
-inline std::string remap_name(RTLIL::IdString abc_name)
+inline std::string remap_name(RTLIL::IdString abc9_name)
 {
-	return stringf("$abc$%d$%s", map_autoidx, abc_name.c_str()+1);
+	return stringf("$abc$%d$%s", map_autoidx, abc9_name.c_str()+1);
 }
 
 void handle_loops(RTLIL::Design *design)
 {
-	Pass::call(design, "scc -set_attr abc_scc_id {}");
+	Pass::call(design, "scc -set_attr abc9_scc_id {}");
 
 	// For every unique SCC found, (arbitrarily) find the first
 	// cell in the component, and select (and mark) all its output
 	// wires
 	pool<RTLIL::Const> ids_seen;
 	for (auto cell : module->cells()) {
-		auto it = cell->attributes.find(ID(abc_scc_id));
+		auto it = cell->attributes.find(ID(abc9_scc_id));
 		if (it != cell->attributes.end()) {
 			auto r = ids_seen.insert(it->second);
 			if (r.second) {
@@ -105,7 +105,7 @@ void handle_loops(RTLIL::Design *design)
 							log_assert(w->port_input);
 							log_assert(b.offset < GetSize(w));
 						}
-						w->set_bool_attribute(ID(abc_scc_break));
+						w->set_bool_attribute(ID(abc9_scc_break));
 						module->swap_names(b.wire, w);
 						c.second = RTLIL::SigBit(w, b.offset);
 					}
@@ -118,7 +118,7 @@ void handle_loops(RTLIL::Design *design)
 	module->fixup_ports();
 }
 
-std::string add_echos_to_abc_cmd(std::string str)
+std::string add_echos_to_abc9_cmd(std::string str)
 {
 	std::string new_str, token;
 	for (size_t i = 0; i < str.size(); i++) {
@@ -140,7 +140,7 @@ std::string add_echos_to_abc_cmd(std::string str)
 	return new_str;
 }
 
-std::string fold_abc_cmd(std::string str)
+std::string fold_abc9_cmd(std::string str)
 {
 	std::string token, new_str = "          ";
 	int char_counter = 10;
@@ -184,7 +184,7 @@ std::string replace_tempdir(std::string text, std::string tempdir_name, bool sho
 	return text;
 }
 
-struct abc_output_filter
+struct abc9_output_filter
 {
 	bool got_cr;
 	int escape_seq_state;
@@ -192,7 +192,7 @@ struct abc_output_filter
 	std::string tempdir_name;
 	bool show_tempdir;
 
-	abc_output_filter(std::string tempdir_name, bool show_tempdir) : tempdir_name(tempdir_name), show_tempdir(show_tempdir)
+	abc9_output_filter(std::string tempdir_name, bool show_tempdir) : tempdir_name(tempdir_name), show_tempdir(show_tempdir)
 	{
 		got_cr = false;
 		escape_seq_state = 0;
@@ -247,7 +247,7 @@ void abc9_module(RTLIL::Design *design, RTLIL::Module *current_module, std::stri
 		bool cleanup, vector<int> lut_costs, bool dff_mode, std::string clk_str,
 		bool /*keepff*/, std::string delay_target, std::string /*lutin_shared*/, bool fast_mode,
 		bool show_tempdir, std::string box_file, std::string lut_file,
-		std::string wire_delay, const dict<int,IdString> &box_lookup
+		std::string wire_delay, const dict<int,IdString> &box_lookup, bool nomfs
 )
 {
 	module = current_module;
@@ -293,68 +293,72 @@ void abc9_module(RTLIL::Design *design, RTLIL::Module *current_module, std::stri
 	log_header(design, "Extracting gate netlist of module `%s' to `%s/input.xaig'..\n",
 			module->name.c_str(), replace_tempdir(tempdir_name, tempdir_name, show_tempdir).c_str());
 
-	std::string abc_script;
+	std::string abc9_script;
 
 	if (!lut_costs.empty()) {
-		abc_script += stringf("read_lut %s/lutdefs.txt; ", tempdir_name.c_str());
+		abc9_script += stringf("read_lut %s/lutdefs.txt; ", tempdir_name.c_str());
 		if (!box_file.empty())
-			abc_script += stringf("read_box -v %s; ", box_file.c_str());
+			abc9_script += stringf("read_box -v %s; ", box_file.c_str());
 	}
 	else
 	if (!lut_file.empty()) {
-		abc_script += stringf("read_lut %s; ", lut_file.c_str());
+		abc9_script += stringf("read_lut %s; ", lut_file.c_str());
 		if (!box_file.empty())
-			abc_script += stringf("read_box -v %s; ", box_file.c_str());
+			abc9_script += stringf("read_box -v %s; ", box_file.c_str());
 	}
 	else
 		log_abort();
 
-	abc_script += stringf("&read %s/input.xaig; &ps; ", tempdir_name.c_str());
+	abc9_script += stringf("&read %s/input.xaig; &ps; ", tempdir_name.c_str());
 
 	if (!script_file.empty()) {
 		if (script_file[0] == '+') {
 			for (size_t i = 1; i < script_file.size(); i++)
 				if (script_file[i] == '\'')
-					abc_script += "'\\''";
+					abc9_script += "'\\''";
 				else if (script_file[i] == ',')
-					abc_script += " ";
+					abc9_script += " ";
 				else
-					abc_script += script_file[i];
+					abc9_script += script_file[i];
 		} else
-			abc_script += stringf("source %s", script_file.c_str());
+			abc9_script += stringf("source %s", script_file.c_str());
 	} else if (!lut_costs.empty() || !lut_file.empty()) {
 		//bool all_luts_cost_same = true;
 		//for (int this_cost : lut_costs)
 		//	if (this_cost != lut_costs.front())
 		//		all_luts_cost_same = false;
-		abc_script += fast_mode ? ABC_FAST_COMMAND_LUT : ABC_COMMAND_LUT;
+		abc9_script += fast_mode ? ABC_FAST_COMMAND_LUT : ABC_COMMAND_LUT;
 		//if (all_luts_cost_same && !fast_mode)
-		//	abc_script += "; lutpack {S}";
+		//	abc9_script += "; lutpack {S}";
 	} else
 		log_abort();
 
 	//if (script_file.empty() && !delay_target.empty())
-	//	for (size_t pos = abc_script.find("dretime;"); pos != std::string::npos; pos = abc_script.find("dretime;", pos+1))
-	//		abc_script = abc_script.substr(0, pos) + "dretime; retime -o {D};" + abc_script.substr(pos+8);
+	//	for (size_t pos = abc9_script.find("dretime;"); pos != std::string::npos; pos = abc9_script.find("dretime;", pos+1))
+	//		abc9_script = abc9_script.substr(0, pos) + "dretime; retime -o {D};" + abc9_script.substr(pos+8);
 
-	for (size_t pos = abc_script.find("{D}"); pos != std::string::npos; pos = abc_script.find("{D}", pos))
-		abc_script = abc_script.substr(0, pos) + delay_target + abc_script.substr(pos+3);
+	for (size_t pos = abc9_script.find("{D}"); pos != std::string::npos; pos = abc9_script.find("{D}", pos))
+		abc9_script = abc9_script.substr(0, pos) + delay_target + abc9_script.substr(pos+3);
 
-	//for (size_t pos = abc_script.find("{S}"); pos != std::string::npos; pos = abc_script.find("{S}", pos))
-	//	abc_script = abc_script.substr(0, pos) + lutin_shared + abc_script.substr(pos+3);
+	//for (size_t pos = abc9_script.find("{S}"); pos != std::string::npos; pos = abc9_script.find("{S}", pos))
+	//	abc9_script = abc9_script.substr(0, pos) + lutin_shared + abc9_script.substr(pos+3);
 
-	for (size_t pos = abc_script.find("{W}"); pos != std::string::npos; pos = abc_script.find("{W}", pos))
-		abc_script = abc_script.substr(0, pos) + wire_delay + abc_script.substr(pos+3);
+	for (size_t pos = abc9_script.find("{W}"); pos != std::string::npos; pos = abc9_script.find("{W}", pos))
+		abc9_script = abc9_script.substr(0, pos) + wire_delay + abc9_script.substr(pos+3);
 
-	abc_script += stringf("; &write %s/output.aig", tempdir_name.c_str());
-	abc_script = add_echos_to_abc_cmd(abc_script);
+	if (nomfs)
+		for (size_t pos = abc9_script.find("&mfs"); pos != std::string::npos; pos = abc9_script.find("&mfs", pos))
+			abc9_script = abc9_script.erase(pos, strlen("&mfs"));
 
-	for (size_t i = 0; i+1 < abc_script.size(); i++)
-		if (abc_script[i] == ';' && abc_script[i+1] == ' ')
-			abc_script[i+1] = '\n';
+	abc9_script += stringf("; &write %s/output.aig", tempdir_name.c_str());
+	abc9_script = add_echos_to_abc9_cmd(abc9_script);
+
+	for (size_t i = 0; i+1 < abc9_script.size(); i++)
+		if (abc9_script[i] == ';' && abc9_script[i+1] == ' ')
+			abc9_script[i+1] = '\n';
 
 	FILE *f = fopen(stringf("%s/abc.script", tempdir_name.c_str()).c_str(), "wt");
-	fprintf(f, "%s\n", abc_script.c_str());
+	fprintf(f, "%s\n", abc9_script.c_str());
 	fclose(f);
 
 	if (dff_mode || !clk_str.empty())
@@ -420,7 +424,7 @@ void abc9_module(RTLIL::Design *design, RTLIL::Module *current_module, std::stri
 		// the expose operation -- remove them from PO/PI
 		// and re-connecting them back together
 		for (auto wire : module->wires()) {
-			auto it = wire->attributes.find(ID(abc_scc_break));
+			auto it = wire->attributes.find(ID(abc9_scc_break));
 			if (it != wire->attributes.end()) {
 				wire->attributes.erase(it);
 				log_assert(wire->port_output);
@@ -450,22 +454,22 @@ void abc9_module(RTLIL::Design *design, RTLIL::Module *current_module, std::stri
 		log("Running ABC command: %s\n", replace_tempdir(buffer, tempdir_name, show_tempdir).c_str());
 
 #ifndef YOSYS_LINK_ABC
-		abc_output_filter filt(tempdir_name, show_tempdir);
-		int ret = run_command(buffer, std::bind(&abc_output_filter::next_line, filt, std::placeholders::_1));
+		abc9_output_filter filt(tempdir_name, show_tempdir);
+		int ret = run_command(buffer, std::bind(&abc9_output_filter::next_line, filt, std::placeholders::_1));
 #else
 		// These needs to be mutable, supposedly due to getopt
-		char *abc_argv[5];
+		char *abc9_argv[5];
 		string tmp_script_name = stringf("%s/abc.script", tempdir_name.c_str());
-		abc_argv[0] = strdup(exe_file.c_str());
-		abc_argv[1] = strdup("-s");
-		abc_argv[2] = strdup("-f");
-		abc_argv[3] = strdup(tmp_script_name.c_str());
-		abc_argv[4] = 0;
-		int ret = Abc_RealMain(4, abc_argv);
-		free(abc_argv[0]);
-		free(abc_argv[1]);
-		free(abc_argv[2]);
-		free(abc_argv[3]);
+		abc9_argv[0] = strdup(exe_file.c_str());
+		abc9_argv[1] = strdup("-s");
+		abc9_argv[2] = strdup("-f");
+		abc9_argv[3] = strdup(tmp_script_name.c_str());
+		abc9_argv[4] = 0;
+		int ret = Abc_RealMain(4, abc9_argv);
+		free(abc9_argv[0]);
+		free(abc9_argv[1]);
+		free(abc9_argv[2]);
+		free(abc9_argv[3]);
 #endif
 		if (ret != 0)
 			log_error("ABC: execution of command \"%s\" failed: return code %d.\n", buffer.c_str(), ret);
@@ -513,7 +517,7 @@ void abc9_module(RTLIL::Design *design, RTLIL::Module *current_module, std::stri
 			signal = std::move(bits);
 		}
 
-		dict<IdString, bool> abc_box;
+		dict<IdString, bool> abc9_box;
 		vector<RTLIL::Cell*> boxes;
 		for (const auto &it : module->cells_) {
 			auto cell = it.second;
@@ -521,10 +525,10 @@ void abc9_module(RTLIL::Design *design, RTLIL::Module *current_module, std::stri
 				module->remove(cell);
 				continue;
 			}
-			auto jt = abc_box.find(cell->type);
-			if (jt == abc_box.end()) {
+			auto jt = abc9_box.find(cell->type);
+			if (jt == abc9_box.end()) {
 				RTLIL::Module* box_module = design->module(cell->type);
-				jt = abc_box.insert(std::make_pair(cell->type, box_module && box_module->attributes.count(ID(abc_box_id)))).first;
+				jt = abc9_box.insert(std::make_pair(cell->type, box_module && box_module->attributes.count(ID(abc9_box_id)))).first;
 			}
 			if (jt->second)
 				boxes.emplace_back(cell);
@@ -648,7 +652,7 @@ void abc9_module(RTLIL::Design *design, RTLIL::Module *current_module, std::stri
 					if (!conn.second.is_wire())
 						continue;
 					Wire *wire = conn.second.as_wire();
-					if (!wire->get_bool_attribute(ID(abc_padding)))
+					if (!wire->get_bool_attribute(ID(abc9_padding)))
 						continue;
 					cell->unsetPort(conn.first);
 					log_debug("Dropping padded port connection for %s (%s) .%s (%s )\n", log_id(cell), cell->type.c_str(), log_id(conn.first), log_signal(conn.second));
@@ -827,17 +831,17 @@ struct Abc9Pass : public Pass {
 		log("        if no -script parameter is given, the following scripts are used:\n");
 		log("\n");
 		log("        for -lut/-luts (only one LUT size):\n");
-		log("%s\n", fold_abc_cmd(ABC_COMMAND_LUT /*"; lutpack {S}"*/).c_str());
+		log("%s\n", fold_abc9_cmd(ABC_COMMAND_LUT /*"; lutpack {S}"*/).c_str());
 		log("\n");
 		log("        for -lut/-luts (different LUT sizes):\n");
-		log("%s\n", fold_abc_cmd(ABC_COMMAND_LUT).c_str());
+		log("%s\n", fold_abc9_cmd(ABC_COMMAND_LUT).c_str());
 		log("\n");
 		log("    -fast\n");
 		log("        use different default scripts that are slightly faster (at the cost\n");
 		log("        of output quality):\n");
 		log("\n");
 		log("        for -lut/-luts:\n");
-		log("%s\n", fold_abc_cmd(ABC_FAST_COMMAND_LUT).c_str());
+		log("%s\n", fold_abc9_cmd(ABC_FAST_COMMAND_LUT).c_str());
 		log("\n");
 		log("    -D <picoseconds>\n");
 		log("        set delay target. the string {D} in the default scripts above is\n");
@@ -921,6 +925,7 @@ struct Abc9Pass : public Pass {
 		std::string delay_target, lutin_shared = "-S 1", wire_delay;
 		bool fast_mode = false, dff_mode = false, keepff = false, cleanup = true;
 		bool show_tempdir = false;
+		bool nomfs = false;
 		vector<int> lut_costs;
 		markgroups = false;
 
@@ -1043,6 +1048,10 @@ struct Abc9Pass : public Pass {
 				wire_delay = "-W " + args[++argidx];
 				continue;
 			}
+			if (arg == "-nomfs") {
+				nomfs = true;
+				continue;
+			}
 			break;
 		}
 		extra_args(args, argidx, design);
@@ -1057,7 +1066,7 @@ struct Abc9Pass : public Pass {
 
 		dict<int,IdString> box_lookup;
 		for (auto m : design->modules()) {
-			auto it = m->attributes.find(ID(abc_box_id));
+			auto it = m->attributes.find(ID(abc9_box_id));
 			if (it == m->attributes.end())
 				continue;
 			if (m->name.begins_with("$paramod"))
@@ -1065,7 +1074,7 @@ struct Abc9Pass : public Pass {
 			auto id = it->second.as_int();
 			auto r = box_lookup.insert(std::make_pair(id, m->name));
 			if (!r.second)
-				log_error("Module '%s' has the same abc_box_id = %d value as '%s'.\n",
+				log_error("Module '%s' has the same abc9_box_id = %d value as '%s'.\n",
 						log_id(m), id, log_id(r.first->second));
 			log_assert(r.second);
 
@@ -1073,24 +1082,24 @@ struct Abc9Pass : public Pass {
 			for (auto p : m->ports) {
 				auto w = m->wire(p);
 				log_assert(w);
-				if (w->attributes.count(ID(abc_carry))) {
+				if (w->attributes.count(ID(abc9_carry))) {
 					if (w->port_input) {
 						if (carry_in)
-							log_error("Module '%s' contains more than one 'abc_carry' input port.\n", log_id(m));
+							log_error("Module '%s' contains more than one 'abc9_carry' input port.\n", log_id(m));
 						carry_in = w;
 					}
 					else if (w->port_output) {
 						if (carry_out)
-							log_error("Module '%s' contains more than one 'abc_carry' input port.\n", log_id(m));
+							log_error("Module '%s' contains more than one 'abc9_carry' input port.\n", log_id(m));
 						carry_out = w;
 					}
 				}
 			}
 			if (carry_in || carry_out) {
 				if (carry_in && !carry_out)
-					log_error("Module '%s' contains an 'abc_carry' input port but no output port.\n", log_id(m));
+					log_error("Module '%s' contains an 'abc9_carry' input port but no output port.\n", log_id(m));
 				if (!carry_in && carry_out)
-					log_error("Module '%s' contains an 'abc_carry' output port but no input port.\n", log_id(m));
+					log_error("Module '%s' contains an 'abc9_carry' output port but no input port.\n", log_id(m));
 				// Make carry_in the last PI, and carry_out the last PO
 				//   since ABC requires it this way
 				auto &ports = m->ports;
@@ -1118,7 +1127,7 @@ struct Abc9Pass : public Pass {
 
 		for (auto mod : design->selected_modules())
 		{
-			if (mod->attributes.count(ID(abc_box_id)))
+			if (mod->attributes.count(ID(abc9_box_id)))
 				continue;
 
 			if (mod->processes.size() > 0) {
@@ -1131,7 +1140,7 @@ struct Abc9Pass : public Pass {
 			if (!dff_mode || !clk_str.empty()) {
 				abc9_module(design, mod, script_file, exe_file, cleanup, lut_costs, dff_mode, clk_str, keepff,
 						delay_target, lutin_shared, fast_mode, show_tempdir,
-						box_file, lut_file, wire_delay, box_lookup);
+						box_file, lut_file, wire_delay, box_lookup, nomfs);
 				continue;
 			}
 
@@ -1277,7 +1286,7 @@ struct Abc9Pass : public Pass {
 				en_sig = assign_map(std::get<3>(it.first));
 				abc9_module(design, mod, script_file, exe_file, cleanup, lut_costs, !clk_sig.empty(), "$",
 						keepff, delay_target, lutin_shared, fast_mode, show_tempdir,
-						box_file, lut_file, wire_delay, box_lookup);
+						box_file, lut_file, wire_delay, box_lookup, nomfs);
 				assign_map.set(mod);
 			}
 		}
diff --git a/passes/techmap/aigmap.cc b/passes/techmap/aigmap.cc
index 1d5e1286b..2ecb2f35a 100644
--- a/passes/techmap/aigmap.cc
+++ b/passes/techmap/aigmap.cc
@@ -27,6 +27,7 @@ struct AigmapPass : public Pass {
 	AigmapPass() : Pass("aigmap", "map logic to and-inverter-graph circuit") { }
 	void help() YS_OVERRIDE
 	{
+		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
 		log("\n");
 		log("    aigmap [options] [selection]\n");
 		log("\n");
@@ -36,10 +37,15 @@ struct AigmapPass : public Pass {
 		log("    -nand\n");
 		log("        Enable creation of $_NAND_ cells\n");
 		log("\n");
+		log("    -select\n");
+		log("        Overwrite replaced cells in the current selection with new $_AND_,\n");
+		log("        $_NOT_, and $_NAND_, cells\n");
+
+		log("\n");
 	}
 	void execute(std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
 	{
-		bool nand_mode = false;
+		bool nand_mode = false, select_mode = false;
 
 		log_header(design, "Executing AIGMAP pass (map logic to AIG).\n");
 
@@ -50,6 +56,10 @@ struct AigmapPass : public Pass {
 				nand_mode = true;
 				continue;
 			}
+			if (args[argidx] == "-select") {
+				select_mode = true;
+				continue;
+			}
 			break;
 		}
 		extra_args(args, argidx, design);
@@ -62,6 +72,7 @@ struct AigmapPass : public Pass {
 			dict<IdString, int> stat_not_replaced;
 			int orig_num_cells = GetSize(module->cells());
 
+			pool<IdString> new_sel;
 			for (auto cell : module->selected_cells())
 			{
 				Aig aig(cell);
@@ -75,6 +86,8 @@ struct AigmapPass : public Pass {
 				if (aig.name.empty()) {
 					not_replaced_count++;
 					stat_not_replaced[cell->type]++;
+					if (select_mode)
+						new_sel.insert(cell->name);
 					continue;
 				}
 
@@ -95,19 +108,33 @@ struct AigmapPass : public Pass {
 						SigBit A = sigs.at(node.left_parent);
 						SigBit B = sigs.at(node.right_parent);
 						if (nand_mode && node.inverter) {
-							bit = module->NandGate(NEW_ID, A, B);
+							bit = module->addWire(NEW_ID);
+							auto gate = module->addNandGate(NEW_ID, A, B, bit);
+							if (select_mode)
+								new_sel.insert(gate->name);
+
 							goto skip_inverter;
 						} else {
 							pair<int, int> key(node.left_parent, node.right_parent);
 							if (and_cache.count(key))
 								bit = and_cache.at(key);
-							else
-								bit = module->AndGate(NEW_ID, A, B);
+							else {
+								bit = module->addWire(NEW_ID);
+								auto gate = module->addAndGate(NEW_ID, A, B, bit);
+								if (select_mode)
+									new_sel.insert(gate->name);
+							}
 						}
 					}
 
-					if (node.inverter)
-						bit = module->NotGate(NEW_ID, bit);
+					if (node.inverter) {
+						SigBit new_bit = module->addWire(NEW_ID);
+						auto gate = module->addNotGate(NEW_ID, bit, new_bit);
+						bit = new_bit;
+						if (select_mode)
+							new_sel.insert(gate->name);
+
+					}
 
 				skip_inverter:
 					for (auto &op : node.outports)
@@ -142,6 +169,13 @@ struct AigmapPass : public Pass {
 
 			for (auto cell : replaced_cells)
 				module->remove(cell);
+
+			if (select_mode) {
+				log_assert(!design->selection_stack.empty());
+				RTLIL::Selection& sel = design->selection_stack.back();
+				sel.selected_members[module->name] = std::move(new_sel);
+			}
+
 		}
 	}
 } AigmapPass;
diff --git a/passes/techmap/techmap.cc b/passes/techmap/techmap.cc
index 08a1af2d5..0c57733d4 100644
--- a/passes/techmap/techmap.cc
+++ b/passes/techmap/techmap.cc
@@ -257,6 +257,12 @@ struct TechmapWorker
 					w->add_strpool_attribute(ID(src), extra_src_attrs);
 			}
 			design->select(module, w);
+
+			if (it.second->name.begins_with("\\_TECHMAP_REPLACE_.")) {
+				IdString replace_name = stringf("%s%s", orig_cell_name.c_str(), it.second->name.c_str() + strlen("\\_TECHMAP_REPLACE_"));
+				Wire *replace_w = module->addWire(replace_name, it.second);
+				module->connect(replace_w, w);
+			}
 		}
 
 		SigMap tpl_sigmap(tpl);
@@ -378,6 +384,8 @@ struct TechmapWorker
 
 			if (techmap_replace_cell)
 				c_name = orig_cell_name;
+			else if (it.second->name.begins_with("\\_TECHMAP_REPLACE_."))
+				c_name = stringf("%s%s", orig_cell_name.c_str(), c_name.c_str() + strlen("\\_TECHMAP_REPLACE_"));
 			else
 				apply_prefix(cell->name, c_name);
 
@@ -1198,6 +1206,12 @@ struct TechmapPass : public Pass {
 		log("\n");
 		log("A cell with the name _TECHMAP_REPLACE_ in the map file will inherit the name\n");
 		log("and attributes of the cell that is being replaced.\n");
+		log("A cell with a name of the form `_TECHMAP_REPLACE_.<suffix>` in the map file will\n");
+		log("be named thus but with the `_TECHMAP_REPLACE_' prefix substituted with the name\n");
+		log("of the cell being replaced.\n");
+		log("Similarly, a wire named in the form `_TECHMAP_REPLACE_.<suffix>` will cause a\n");
+		log("new wire alias to be created and named as above but with the `_TECHMAP_REPLACE_'\n");
+		log("prefix also substituted.\n");
 		log("\n");
 		log("See 'help extract' for a pass that does the opposite thing.\n");
 		log("\n");