diff options
Diffstat (limited to 'passes/pmgen/xilinx_dsp_cascade.pmg')
-rw-r--r-- | passes/pmgen/xilinx_dsp_cascade.pmg | 486 |
1 files changed, 486 insertions, 0 deletions
diff --git a/passes/pmgen/xilinx_dsp_cascade.pmg b/passes/pmgen/xilinx_dsp_cascade.pmg new file mode 100644 index 000000000..b14a1ee0a --- /dev/null +++ b/passes/pmgen/xilinx_dsp_cascade.pmg @@ -0,0 +1,486 @@ +// This file describes the third of three pattern matcher setups that +// forms the `xilinx_dsp` pass described in xilinx_dsp.cc +// At a high level, it works as follows: +// (1) Starting from a DSP48E1 cell that (a) has the Z multiplexer +// (controlled by OPMODE[6:4]) set to zero and (b) doesn't already +// use the 'PCOUT' port +// (2.1) Match another DSP48E1 cell that (a) does not have the CREG enabled, +// (b) has its Z multiplexer output set to the 'C' port, which is +// driven by the 'P' output of the previous DSP cell, and (c) has its +// 'PCIN' port unused +// (2.2) Same as (2.1) but with the 'C' port driven by the 'P' output of the +// previous DSP cell right-shifted by 17 bits +// (3) For this subequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists) +// if (a) the previous DSP48E1 uses either the A2REG or A1REG, (b) this +// DSP48 does not use A2REG nor A1REG, (c) this DSP48E1 does not already +// have an ACOUT -> ACIN cascade, (d) the previous DSP does not already +// use its ACOUT port, then examine if an ACOUT -> ACIN cascade +// opportunity exists by matching for a $dff-with-optional-clock-enable- +// or-reset and checking that the 'D' input of this register is the same +// as the 'A' input of the previous DSP +// (4) Same as (3) but for BCOUT -> BCIN cascade +// (5) Recursively go to (2.1) until no more matches possible, keeping track +// of the longest possible chain found +// (6) The longest chain is then divided into chunks of no more than +// MAX_DSP_CASCADE in length (to prevent long cascades that exceed the +// height of a DSP column) with each DSP in each chunk being rewritten +// to use [ABP]COUT -> [ABP]CIN cascading as appropriate +// Notes: +// - Currently, [AB]COUT -> [AB]COUT cascades (3 or 4) are only considered +// if a PCOUT -> PCIN cascade is (2.1 or 2.2) first identified; this need +// not be the case --- [AB] cascades can exist independently of a P cascade +// (though all three cascades must come from the same DSP). This situation +// is not handled currently. +// - In addition, [AB]COUT -> [AB]COUT cascades (3 or 4) are currently +// conservative in that they examine the situation where (a) the previous +// DSP has [AB]2REG or [AB]1REG enabled, (b) that the downstream DSP has no +// registers enabled, and (c) that there exists only one additional register +// between the upstream and downstream DSPs. This can certainly be relaxed +// to identify situations ranging from (i) neither DSP uses any registers, +// to (ii) upstream DSP has 2 registers, downstream DSP has 2 registers, and +// there exists a further 2 registers between them. This remains a TODO +// item. + +pattern xilinx_dsp_cascade + +udata <std::function<SigSpec(const SigSpec&)>> unextend +udata <vector<std::tuple<Cell*,int,int,int>>> chain longest_chain +state <Cell*> next +state <SigSpec> clock +state <int> AREG BREG + +// Variables used for subpatterns +state <SigSpec> argQ argD +state <bool> ffcepol ffrstpol +state <int> ffoffset +udata <SigSpec> dffD dffQ +udata <SigBit> dffclock +udata <Cell*> dff dffcemux dffrstmux +udata <bool> dffcepol dffrstpol + +code +#define MAX_DSP_CASCADE 20 +endcode + +// (1) Starting from a DSP48* cell that (a) has the Z multiplexer +// (controlled by OPMODE[3:2] for DSP48A*, by OPMODE[6:4] for DSP48E1) +// set to zero and (b) doesn't already use the 'PCOUT' port +match first + select (first->type.in(\DSP48A, \DSP48A1) && port(first, \OPMODE, Const(0, 8)).extract(2,2) == Const::from_string("00")) || (first->type.in(\DSP48E1) && port(first, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("000")) + select nusers(port(first, \PCOUT, SigSpec())) <= 1 +endmatch + +// (6) The longest chain is then divided into chunks of no more than +// MAX_DSP_CASCADE in length (to prevent long cascades that exceed the +// height of a DSP column) with each DSP in each chunk being rewritten +// to use [ABP]COUT -> [ABP]CIN cascading as appropriate +code + longest_chain.clear(); + chain.emplace_back(first, -1, -1, -1); + subpattern(tail); +finally + chain.pop_back(); + log_assert(chain.empty()); + if (GetSize(longest_chain) > 1) { + Cell *dsp = std::get<0>(longest_chain.front()); + + Cell *dsp_pcin; + int P, AREG, BREG; + for (int i = 1; i < GetSize(longest_chain); i++) { + std::tie(dsp_pcin,P,AREG,BREG) = longest_chain[i]; + + if (i % MAX_DSP_CASCADE > 0) { + if (P >= 0) { + Wire *cascade = module->addWire(NEW_ID, 48); + dsp_pcin->setPort(ID(C), Const(0, 48)); + dsp_pcin->setPort(ID(PCIN), cascade); + dsp->setPort(ID(PCOUT), cascade); + add_siguser(cascade, dsp_pcin); + add_siguser(cascade, dsp); + + SigSpec opmode = port(dsp_pcin, \OPMODE, Const(0, 7)); + if (dsp->type.in(\DSP48A, \DSP48A1)) { + log_assert(P == 0); + opmode[3] = State::S0; + opmode[2] = State::S1; + } + else if (dsp->type.in(\DSP48E1)) { + if (P == 17) + opmode[6] = State::S1; + else if (P == 0) + opmode[6] = State::S0; + else log_abort(); + + opmode[5] = State::S0; + opmode[4] = State::S1; + } + dsp_pcin->setPort(\OPMODE, opmode); + + log_debug("PCOUT -> PCIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin)); + } + if (AREG >= 0) { + Wire *cascade = module->addWire(NEW_ID, 30); + dsp_pcin->setPort(ID(A), Const(0, 30)); + dsp_pcin->setPort(ID(ACIN), cascade); + dsp->setPort(ID(ACOUT), cascade); + add_siguser(cascade, dsp_pcin); + add_siguser(cascade, dsp); + + if (dsp->type.in(\DSP48E1)) + dsp->setParam(ID(ACASCREG), AREG); + dsp_pcin->setParam(ID(A_INPUT), Const("CASCADE")); + + log_debug("ACOUT -> ACIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin)); + } + if (BREG >= 0) { + Wire *cascade = module->addWire(NEW_ID, 18); + if (dsp->type.in(\DSP48A, \DSP48A1)) { + // According to UG389 p9 [https://www.xilinx.com/support/documentation/user_guides/ug389.pdf] + // "The DSP48A1 component uses this input when cascading + // BCOUT from an adjacent DSP48A1 slice. The tools then + // translate BCOUT cascading to the dedicated BCIN input + // and set the B_INPUT attribute for implementation." + dsp_pcin->setPort(ID(B), cascade); + } + else { + dsp_pcin->setPort(ID(B), Const(0, 18)); + dsp_pcin->setPort(ID(BCIN), cascade); + } + dsp->setPort(ID(BCOUT), cascade); + add_siguser(cascade, dsp_pcin); + add_siguser(cascade, dsp); + + if (dsp->type.in(\DSP48E1)) { + dsp->setParam(ID(BCASCREG), BREG); + // According to UG389 p13 [https://www.xilinx.com/support/documentation/user_guides/ug389.pdf] + // "The attribute is only used by place and route tools and + // is not necessary for the users to set for synthesis. The + // attribute is determined by the connection to the B port + // of the DSP48A1 slice. If the B port is connected to the + // BCOUT of another DSP48A1 slice, then the tools automatically + // set the attribute to 'CASCADE', otherwise it is set to + // 'DIRECT'". + dsp_pcin->setParam(ID(B_INPUT), Const("CASCADE")); + } + + log_debug("BCOUT -> BCIN cascade for %s -> %s\n", log_id(dsp), log_id(dsp_pcin)); + } + } + else { + log_debug(" Blocking %s -> %s cascade (exceeds max: %d)\n", log_id(dsp), log_id(dsp_pcin), MAX_DSP_CASCADE); + } + + dsp = dsp_pcin; + } + + accept; + } +endcode + +// ------------------------------------------------------------------ + +subpattern tail +arg first +arg next + +// (2.1) Match another DSP48* cell that (a) does not have the CREG enabled, +// (b) has its Z multiplexer output set to the 'C' port, which is +// driven by the 'P' output of the previous DSP cell, and (c) has its +// 'PCIN' port unused +match nextP + select !param(nextP, \CREG, State::S1).as_bool() + select (nextP->type.in(\DSP48A, \DSP48A1) && port(nextP, \OPMODE, Const(0, 8)).extract(2,2) == Const::from_string("11")) || (nextP->type.in(\DSP48E1) && port(nextP, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("011")) + select nusers(port(nextP, \C, SigSpec())) > 1 + select nusers(port(nextP, \PCIN, SigSpec())) == 0 + index <SigBit> port(nextP, \C)[0] === port(std::get<0>(chain.back()), \P)[0] + semioptional +endmatch + +// (2.2) For DSP48E1 only, same as (2.1) but with the 'C' port driven +// by the 'P' output of the previous DSP cell right-shifted by 17 bits +match nextP_shift17 + if !nextP + select nextP_shift17->type.in(\DSP48E1) + select !param(nextP_shift17, \CREG, State::S1).as_bool() + select port(nextP_shift17, \OPMODE, Const(0, 7)).extract(4,3) == Const::from_string("011") + select nusers(port(nextP_shift17, \C, SigSpec())) > 1 + select nusers(port(nextP_shift17, \PCIN, SigSpec())) == 0 + index <SigBit> port(nextP_shift17, \C)[0] === port(std::get<0>(chain.back()), \P)[17] + semioptional +endmatch + +code next + next = nextP; + if (!nextP) + next = nextP_shift17; + if (next) { + if (next->type != first->type) + reject; + unextend = [](const SigSpec &sig) { + int i; + for (i = GetSize(sig)-1; i > 0; i--) + if (sig[i] != sig[i-1]) + break; + // Do not remove non-const sign bit + if (sig[i].wire) + ++i; + return sig.extract(0, i); + }; + } +endcode + +// (3) For this subequent DSP48E1 match (i.e. PCOUT -> PCIN cascade exists) +// if (a) this DSP48E1 does not already have an ACOUT -> ACIN cascade, +// (b) the previous DSP does not already use its ACOUT port, then +// examine if an ACOUT -> ACIN cascade opportunity exists if +// (i) A ports are identical, or (ii) separated by a +// $dff-with-optional-clock-enable-or-reset and checking that the 'D' input +// of this register is the same as the 'A' input of the previous DSP +// TODO: Check for two levels of flops, instead of just one +code argQ clock AREG + AREG = -1; + if (next && next->type.in(\DSP48E1)) { + Cell *prev = std::get<0>(chain.back()); + + if (param(next, \A_INPUT, Const("DIRECT")).decode_string() == "DIRECT" && + port(next, \ACIN, SigSpec()).is_fully_zero() && + nusers(port(prev, \ACOUT, SigSpec())) <= 1) { + if (param(prev, \AREG, 2) == 0) { + if (port(prev, \A) == port(next, \A)) + AREG = 0; + } + else { + argQ = unextend(port(next, \A)); + clock = port(prev, \CLK); + subpattern(in_dffe); + if (dff) { + if (!dffrstmux && port(prev, \RSTA, State::S0) != State::S0) + goto reject_AREG; + if (dffrstmux && port(dffrstmux, \S) != port(prev, \RSTA, State::S0)) + goto reject_AREG; + IdString CEA; + if (param(prev, \AREG, 2) == 1) + CEA = \CEA2; + else if (param(prev, \AREG, 2) == 2) + CEA = \CEA1; + else log_abort(); + if (!dffcemux && port(prev, CEA, State::S0) != State::S1) + goto reject_AREG; + if (dffcemux && port(dffcemux, \S) != port(prev, CEA, State::S0)) + goto reject_AREG; + if (dffD == unextend(port(prev, \A))) + AREG = 1; + } + } + } +reject_AREG: ; + } +endcode + +// (4) Same as (3) but for BCOUT -> BCIN cascade +code argQ clock BREG + BREG = -1; + if (next) { + Cell *prev = std::get<0>(chain.back()); + if (param(next, \B_INPUT, Const("DIRECT")).decode_string() == "DIRECT" && + port(next, \BCIN, SigSpec()).is_fully_zero() && + nusers(port(prev, \BCOUT, SigSpec())) <= 1) { + if ((next->type.in(\DSP48A, \DSP48A1) && param(prev, \B0REG, 0) == 0 && param(prev, \B1REG, 1) == 0) || + (next->type.in(\DSP48E1) && param(prev, \BREG, 2) == 0)) { + if (port(prev, \B) == port(next, \B)) + BREG = 0; + } + else { + argQ = unextend(port(next, \B)); + clock = port(prev, \CLK); + subpattern(in_dffe); + if (dff) { + if (!dffrstmux && port(prev, \RSTB, State::S0) != State::S0) + goto reject_BREG; + if (dffrstmux && port(dffrstmux, \S) != port(prev, \RSTB, State::S0)) + goto reject_BREG; + IdString CEB; + if (next->type.in(\DSP48A, \DSP48A1)) + CEB = \CEB; + else if (next->type.in(\DSP48E1)) { + if (param(prev, \BREG, 2) == 1) + CEB = \CEB2; + else if (param(prev, \BREG, 2) == 2) + CEB = \CEB1; + else log_abort(); + } + else log_abort(); + if (!dffcemux && port(prev, CEB, State::S0) != State::S1) + goto reject_BREG; + if (dffcemux && port(dffcemux, \S) != port(prev, CEB, State::S0)) + goto reject_BREG; + if (dffD == unextend(port(prev, \B))) { + if (next->type.in(\DSP48A, \DSP48A1) && param(prev, \B0REG, 0) != 0) + goto reject_BREG; + BREG = 1; + } + } + } + } +reject_BREG: ; + } +endcode + +// (5) Recursively go to (2.1) until no more matches possible, recording the +// longest possible chain +code + if (next) { + chain.emplace_back(next, nextP_shift17 ? 17 : nextP ? 0 : -1, AREG, BREG); + + SigSpec sigC = unextend(port(next, \C)); + + if (nextP_shift17) { + if (GetSize(sigC)+17 <= GetSize(port(std::get<0>(chain.back()), \P)) && + port(std::get<0>(chain.back()), \P).extract(17, GetSize(sigC)) != sigC) + subpattern(tail); + } + else { + if (GetSize(sigC) <= GetSize(port(std::get<0>(chain.back()), \P)) && + port(std::get<0>(chain.back()), \P).extract(0, GetSize(sigC)) != sigC) + subpattern(tail); + + } + } else { + if (GetSize(chain) > GetSize(longest_chain)) + longest_chain = chain; + } +finally + if (next) + chain.pop_back(); +endcode + +// ####################### + +// Subpattern for matching against input registers, based on knowledge of the +// 'Q' input. Typically, identifying registers with clock-enable and reset +// capability would be a task would be handled by other Yosys passes such as +// dff2dffe, but since DSP inference happens much before this, these patterns +// have to be manually identified. +// At a high level: +// (1) Starting from a $dff cell that (partially or fully) drives the given +// 'Q' argument +// (2) Match for a $mux cell implementing synchronous reset semantics --- +// one that exclusively drives the 'D' input of the $dff, with one of its +// $mux inputs being fully zero +// (3) Match for a $mux cell implement clock enable semantics --- one that +// exclusively drives the 'D' input of the $dff (or the other input of +// the reset $mux) and where one of this $mux's inputs is connected to +// the 'Q' output of the $dff +subpattern in_dffe +arg argD argQ clock + +code + dff = nullptr; + for (const auto &c : argQ.chunks()) { + // Abandon matches when 'Q' is a constant + if (!c.wire) + reject; + // Abandon matches when 'Q' has the keep attribute set + if (c.wire->get_bool_attribute(\keep)) + reject; + // Abandon matches when 'Q' has a non-zero init attribute set + // (not supported by DSP48E1) + Const init = c.wire->attributes.at(\init, Const()); + for (auto b : init.extract(c.offset, c.width)) + if (b != State::Sx && b != State::S0) + reject; + } +endcode + +// (1) Starting from a $dff cell that (partially or fully) drives the given +// 'Q' argument +match ff + select ff->type.in($dff) + // DSP48E1 does not support clock inversion + select param(ff, \CLK_POLARITY).as_bool() + + slice offset GetSize(port(ff, \D)) + index <SigBit> port(ff, \Q)[offset] === argQ[0] + + // Check that the rest of argQ is present + filter GetSize(port(ff, \Q)) >= offset + GetSize(argQ) + filter port(ff, \Q).extract(offset, GetSize(argQ)) == argQ + + filter clock == SigBit() || port(ff, \CLK) == clock + + set ffoffset offset +endmatch + +code argQ argD + SigSpec Q = port(ff, \Q); + dff = ff; + dffclock = port(ff, \CLK); + dffD = argQ; + argD = port(ff, \D); + argQ = Q; + dffD.replace(argQ, argD); + // Only search for ffrstmux if dffD only + // has two (ff, ffrstmux) users + if (nusers(dffD) > 2) + argD = SigSpec(); +endcode + +// (2) Match for a $mux cell implementing synchronous reset semantics --- +// exclusively drives the 'D' input of the $dff, with one of the $mux +// inputs being fully zero +match ffrstmux + if !argD.empty() + select ffrstmux->type.in($mux) + index <SigSpec> port(ffrstmux, \Y) === argD + + choice <IdString> BA {\B, \A} + // DSP48E1 only supports reset to zero + select port(ffrstmux, BA).is_fully_zero() + + define <bool> pol (BA == \B) + set ffrstpol pol + semioptional +endmatch + +code argD + if (ffrstmux) { + dffrstmux = ffrstmux; + dffrstpol = ffrstpol; + argD = port(ffrstmux, ffrstpol ? \A : \B); + dffD.replace(port(ffrstmux, \Y), argD); + + // Only search for ffcemux if argQ has at + // least 3 users (ff, <upstream>, ffrstmux) and + // dffD only has two (ff, ffrstmux) + if (!(nusers(argQ) >= 3 && nusers(dffD) == 2)) + argD = SigSpec(); + } + else + dffrstmux = nullptr; +endcode + +// (3) Match for a $mux cell implement clock enable semantics --- one that +// exclusively drives the 'D' input of the $dff (or the other input of +// the reset $mux) and where one of this $mux's inputs is connected to +// the 'Q' output of the $dff +match ffcemux + if !argD.empty() + select ffcemux->type.in($mux) + index <SigSpec> port(ffcemux, \Y) === argD + choice <IdString> AB {\A, \B} + index <SigSpec> port(ffcemux, AB) === argQ + define <bool> pol (AB == \A) + set ffcepol pol + semioptional +endmatch + +code argD + if (ffcemux) { + dffcemux = ffcemux; + dffcepol = ffcepol; + argD = port(ffcemux, ffcepol ? \B : \A); + dffD.replace(port(ffcemux, \Y), argD); + } + else + dffcemux = nullptr; +endcode |