1 files changed, 969 insertions, 0 deletions
diff --git a/mistral/lab.cc b/mistral/lab.cc
new file mode 100644
index 00000000..abd0fec3
--- /dev/null
+++ b/mistral/lab.cc
@@ -0,0 +1,969 @@
+/*
+ *  nextpnr -- Next Generation Place and Route
+ *
+ *  Copyright (C) 2021  gatecat <gatecat@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "design_utils.h"
+#include "log.h"
+#include "nextpnr.h"
+#include "util.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+// This file contains functions related to our custom LAB structure, including creating the LAB bels; checking the
+// legality of LABs; and manipulating LUT inputs and equations
+
+// LAB/ALM structure creation functions
+namespace {
+static void create_alm(Arch *arch, int x, int y, int z, uint32_t lab_idx)
+{
+    auto &lab = arch->labs.at(lab_idx);
+    auto &alm = lab.alms.at(z);
+    // Builds the bels, wires and pips of ALM `z` in the LAB at tile (x, y); `lab_idx` is that LAB's index in arch->labs.
+    // First the combinational part: two MISTRAL_COMB bels, for the two LUT outputs, which also contain the carry chain.
+    // Each one has all 8 ALM inputs as input pins. In many cases only a subset of these are used; depending on mode;
+    // and the bel-cell pin mappings are used to handle this post-placement without losing flexibility
+    for (int i = 0; i < 2; i++) {
+        // Carry/share wires are a bit tricky due to all the different permutations
+        WireId carry_in, share_in;
+        WireId carry_out, share_out;
+        if (z == 0 && i == 0) { // first unit in the LAB: the chain enters on the CI/SHAREIN wires
+            carry_in = arch->add_wire(x, y, id_CI);
+            share_in = arch->add_wire(x, y, id_SHAREIN);
+            if (y < (arch->getGridDimY() - 1)) {
+                // Carry is split at tile boundary (TTO_DIS bit), add a PIP from the CO/SHAREOUT wires at (x, y + 1).
+                // TODO: what about BTO_DIS, in the middle of the LAB?
+                arch->add_pip(arch->add_wire(x, y + 1, id_CO), carry_in);
+                arch->add_pip(arch->add_wire(x, y + 1, id_SHAREOUT), share_in);
+            }
+        } else {
+            // Otherwise the input is the output wire of the previous combinational unit, index (z * 2 + i) - 1
+            carry_in = arch->add_wire(x, y, arch->id(stringf("CARRY[%d]", (z * 2 + i) - 1)));
+            share_in = arch->add_wire(x, y, arch->id(stringf("SHARE[%d]", (z * 2 + i) - 1)));
+        }
+
+        if (z == 9 && i == 1) { // last unit in the LAB: the chain leaves on the CO/SHAREOUT wires
+            carry_out = arch->add_wire(x, y, id_CO);
+            share_out = arch->add_wire(x, y, id_SHAREOUT);
+        } else {
+            carry_out = arch->add_wire(x, y, arch->id(stringf("CARRY[%d]", z * 2 + i)));
+            share_out = arch->add_wire(x, y, arch->id(stringf("SHARE[%d]", z * 2 + i)));
+        }
+
+        BelId bel = arch->add_bel(x, y, arch->id(stringf("ALM%d_COMB%d", z, i)), id_MISTRAL_COMB);
+        // LUT/MUX inputs - both combinational bels of the ALM are attached to the same eight port wires
+        arch->add_bel_pin(bel, id_A, PORT_IN, arch->get_port(CycloneV::LAB, x, y, z, CycloneV::A));
+        arch->add_bel_pin(bel, id_B, PORT_IN, arch->get_port(CycloneV::LAB, x, y, z, CycloneV::B));
+        arch->add_bel_pin(bel, id_C, PORT_IN, arch->get_port(CycloneV::LAB, x, y, z, CycloneV::C));
+        arch->add_bel_pin(bel, id_D, PORT_IN, arch->get_port(CycloneV::LAB, x, y, z, CycloneV::D));
+        arch->add_bel_pin(bel, id_E0, PORT_IN, arch->get_port(CycloneV::LAB, x, y, z, CycloneV::E0));
+        arch->add_bel_pin(bel, id_E1, PORT_IN, arch->get_port(CycloneV::LAB, x, y, z, CycloneV::E1));
+        arch->add_bel_pin(bel, id_F0, PORT_IN, arch->get_port(CycloneV::LAB, x, y, z, CycloneV::F0));
+        arch->add_bel_pin(bel, id_F1, PORT_IN, arch->get_port(CycloneV::LAB, x, y, z, CycloneV::F1));
+        // Carry/share chain
+        arch->add_bel_pin(bel, id_CI, PORT_IN, carry_in);
+        arch->add_bel_pin(bel, id_SHAREIN, PORT_IN, share_in);
+        arch->add_bel_pin(bel, id_CO, PORT_OUT, carry_out);
+        arch->add_bel_pin(bel, id_SHAREOUT, PORT_OUT, share_out);
+        // Combinational output
+        alm.comb_out[i] = arch->add_wire(x, y, arch->id(stringf("COMBOUT[%d]", z * 2 + i)));
+        arch->add_bel_pin(bel, id_COMBOUT, PORT_OUT, alm.comb_out[i]);
+        // Assign indexing: remember the bel in the ALM and record its LAB, ALM and half index on the bel
+        alm.lut_bels.at(i) = bel;
+        auto &b = arch->bel_data(bel);
+        b.lab_data.lab = lab_idx;
+        b.lab_data.alm = z;
+        b.lab_data.idx = i;
+    }
+    // Create the control set and E/F selection - which is per pair of FF (wire names use 'T' for i == 0, 'B' for i == 1)
+    for (int i = 0; i < 2; i++) {
+        // Wires
+        alm.sel_clk[i] = arch->add_wire(x, y, arch->id(stringf("CLK%c[%d]", i ? 'B' : 'T', z)));
+        alm.sel_ena[i] = arch->add_wire(x, y, arch->id(stringf("ENA%c[%d]", i ? 'B' : 'T', z)));
+        alm.sel_aclr[i] = arch->add_wire(x, y, arch->id(stringf("ACLR%c[%d]", i ? 'B' : 'T', z)));
+        alm.sel_ef[i] = arch->add_wire(x, y, arch->id(stringf("%cEF[%d]", i ? 'B' : 'T', z)));
+        // Muxes - three CLK/ENA per LAB, two ACLR; every LAB-level wire gets a pip to this pair's selected wire
+        for (int j = 0; j < 3; j++) {
+            arch->add_pip(lab.clk_wires[j], alm.sel_clk[i]);
+            arch->add_pip(lab.ena_wires[j], alm.sel_ena[i]);
+            if (j < 2)
+                arch->add_pip(lab.aclr_wires[j], alm.sel_aclr[i]);
+        }
+        // E/F pips
+        // Note that the F choice is mirrored, F from the other half is picked
+        arch->add_pip(arch->get_port(CycloneV::LAB, x, y, z, i ? CycloneV::E1 : CycloneV::E0), alm.sel_ef[i]);
+        arch->add_pip(arch->get_port(CycloneV::LAB, x, y, z, i ? CycloneV::F0 : CycloneV::F1), alm.sel_ef[i]);
+    }
+
+    // Create the flipflops and associated routing
+    const CycloneV::port_type_t outputs[4] = {CycloneV::FFT0, CycloneV::FFT1, CycloneV::FFB0, CycloneV::FFB1};
+    const CycloneV::port_type_t l_outputs[4] = {CycloneV::FFT1L, CycloneV::FFB1L}; // only [0] and [1] are read below
+
+    for (int i = 0; i < 4; i++) {
+        // FF input, selected by *PKREG* - either the combinational output or the E/F selection of this half (i / 2)
+        alm.ff_in[i] = arch->add_wire(x, y, arch->id(stringf("FFIN[%d]", (z * 4) + i)));
+        arch->add_pip(alm.comb_out[i / 2], alm.ff_in[i]);
+        arch->add_pip(alm.sel_ef[i / 2], alm.ff_in[i]);
+        // FF bel - the two FFs of a half share that half's selected CLK/ENA/ACLR wires; SCLR/SLOAD are LAB-wide
+        BelId bel = arch->add_bel(x, y, arch->id(stringf("ALM%d_FF%d", z, i)), id_MISTRAL_FF);
+        arch->add_bel_pin(bel, id_CLK, PORT_IN, alm.sel_clk[i / 2]);
+        arch->add_bel_pin(bel, id_ENA, PORT_IN, alm.sel_ena[i / 2]);
+        arch->add_bel_pin(bel, id_ACLR, PORT_IN, alm.sel_aclr[i / 2]);
+        arch->add_bel_pin(bel, id_SCLR, PORT_IN, lab.sclr_wire);
+        arch->add_bel_pin(bel, id_SLOAD, PORT_IN, lab.sload_wire);
+        arch->add_bel_pin(bel, id_DATAIN, PORT_IN, alm.ff_in[i]);
+        arch->add_bel_pin(bel, id_SDATA, PORT_IN, alm.sel_ef[i / 2]);
+
+        // FF output
+        alm.ff_out[i] = arch->add_wire(x, y, arch->id(stringf("FFOUT[%d]", (z * 4) + i)));
+        arch->add_bel_pin(bel, id_Q, PORT_OUT, alm.ff_out[i]);
+        // Output mux (*DFF*) - the ALM output port can take either the FF output or the combinational output
+        WireId out = arch->get_port(CycloneV::LAB, x, y, z, outputs[i]);
+        arch->add_pip(alm.ff_out[i], out);
+        arch->add_pip(alm.comb_out[i / 2], out);
+        // 'L' output mux where applicable
+        if (i == 1 || i == 3) { // i / 2 is then 0 or 1, selecting FFT1L or FFB1L
+            WireId l_out = arch->get_port(CycloneV::LAB, x, y, z, l_outputs[i / 2]);
+            arch->add_pip(alm.ff_out[i], l_out);
+            arch->add_pip(alm.comb_out[i / 2], l_out);
+        }
+
+        lab.alms.at(z).ff_bels.at(i) = bel;
+        auto &b = arch->bel_data(bel);
+        b.lab_data.lab = lab_idx;
+        b.lab_data.alm = z;
+        b.lab_data.idx = i;
+    }
+}
+} // namespace
+
+void Arch::create_lab(int x, int y)
+{
+    // The new LAB takes the next free index in `labs`; create_alm looks it up again through that index
+    const uint32_t lab_idx = labs.size();
+    labs.emplace_back();
+    auto &new_lab = labs.back();
+
+    // Port lookup with the ALM argument fixed at -1, as used for every LAB-wide signal below
+    auto lab_port = [&](CycloneV::port_type_t type, int index) {
+        return get_port(CycloneV::LAB, x, y, -1, type, index);
+    };
+
+    // Common control set configuration. This is deliberately a subset of what's possible, erring on the side of
+    // caution due to incomplete documentation.
+    // Clocks - hardcoded to the CLKA choices, as both CLKA and CLKB coming from general routing causes unexpected
+    // permutations. Each CLKn wire can be fed from dedicated routing (CLKIN 0) or from general routing (DATAIN 0).
+    for (int clk = 0; clk < 3; ++clk) {
+        const WireId clk_wire = add_wire(x, y, id(stringf("CLK%d", clk)));
+        new_lab.clk_wires[clk] = clk_wire;
+        add_pip(lab_port(CycloneV::CLKIN, 0), clk_wire);
+        add_pip(lab_port(CycloneV::DATAIN, 0), clk_wire);
+    }
+
+    // Enables, ACLRs, SCLR and SLOAD come from fixed DATAIN lines. The config looks like it offers choices, but it
+    // seems like EN0_SEL actually selects SCLR (not ENA0) and EN1_SEL actually selects SLOAD - so play it safe.
+    const int ena_src[3] = {2, 3, 0};
+    for (int k = 0; k < 3; ++k)
+        new_lab.ena_wires[k] = lab_port(CycloneV::DATAIN, ena_src[k]);
+    const int aclr_src[2] = {3, 2}; // only general routing is considered for ACLR for now
+    for (int k = 0; k < 2; ++k)
+        new_lab.aclr_wires[k] = lab_port(CycloneV::DATAIN, aclr_src[k]);
+    new_lab.sclr_wire = lab_port(CycloneV::DATAIN, 3);
+    new_lab.sload_wire = lab_port(CycloneV::DATAIN, 1);
+
+    for (int alm = 0; alm < 10; ++alm)
+        create_alm(this, x, y, alm, lab_idx);
+}
+
+// Cell handling and annotation functions
+namespace {
+ControlSig get_ctrlsig(const Context *ctx, const CellInfo *cell, IdString port, bool explicit_const = false)
+{
+    // Build the control signal seen at `port` of `cell`: the connected net (if any) and whether the pin is inverted
+    ControlSig sig;
+    sig.net = get_net_or_empty(cell, port);
+    // For ENA, 1 (and 0) are explicit control set choices even though they aren't routed, as "no ENA" still consumes
+    // a clock+ENA pair - so with explicit_const an unconnected port is represented by a constant net (currently VCC)
+    const CellPinState implied_state = PIN_1;
+    if (explicit_const && sig.net == nullptr) {
+        const IdString const_net = (implied_state == PIN_1) ? ctx->id("$PACKER_VCC_NET") : ctx->id("$PACKER_GND_NET");
+        sig.net = ctx->nets.at(const_net).get();
+    }
+    // A port without a pin_data entry counts as non-inverted
+    sig.inverted = (cell->pin_data.count(port) != 0) && (cell->pin_data.at(port).state == PIN_INV);
+    return sig;
+}
+} // namespace
+
+bool Arch::is_comb_cell(IdString cell_type) const
+{
+    // Returns true if `cell_type` is a combinational cell type, i.e. one to be placed at a MISTRAL_COMB location.
+    // NOTE(review): MISTRAL_BUF is accepted by assign_comb_info but is not listed here - confirm that is intended.
+    static constexpr int comb_type_ids[] = {
+            ID_MISTRAL_ALUT6, ID_MISTRAL_ALUT5,
+            ID_MISTRAL_ALUT4, ID_MISTRAL_ALUT3,
+            ID_MISTRAL_ALUT2, ID_MISTRAL_NOT,
+            ID_MISTRAL_CONST, ID_MISTRAL_ALUT_ARITH,
+    };
+
+    for (int type_id : comb_type_ids) {
+        if (cell_type.index == type_id)
+            return true;
+    }
+    return false;
+}
+
+void Arch::assign_comb_info(CellInfo *cell) const
+{
+    // Recompute the cached combInfo of a combinational cell from its type and its current port connections.
+    // NOTE(review): comb_out is only assigned in the arithmetic branch below - confirm that other cell types get
+    // it from elsewhere.
+    auto &info = cell->combInfo;
+    info.is_carry = false;
+    info.is_shared = false;
+    info.is_extended = false;
+    info.carry_start = false;
+    info.carry_end = false;
+    info.chain_shared_input_count = 0;
+
+    if (cell->type == id_MISTRAL_ALUT_ARITH) {
+        // Arithmetic cells are a special case in terms of naming: the five LUT inputs are A, B, C, D0 and D1
+        const std::array<IdString, 5> arith_pins{id_A, id_B, id_C, id_D0, id_D1};
+        info.is_carry = true;
+        info.lut_input_count = 5;
+        info.lut_bits_count = 32;
+        for (int i = 0; i < 5; i++)
+            info.lut_in[i] = get_net_or_empty(cell, arith_pins[i]);
+
+        const NetInfo *carry_in = get_net_or_empty(cell, id_CI);
+        const NetInfo *carry_out = get_net_or_empty(cell, id_CO);
+        info.comb_out = get_net_or_empty(cell, id_SO);
+
+        // The chain starts here if CI is unconnected or undriven, and ends here if CO is unconnected or has no users
+        info.carry_start = (carry_in == nullptr) || (carry_in->driver.cell == nullptr);
+        info.carry_end = (carry_out == nullptr) || carry_out->users.empty();
+
+        // Compute cross-ALM routing sharing. This is only done for cells constrained to an even z greater than zero
+        // whose CI net has a driver: count the connected LUT inputs that see the same net on the same pin of the
+        // driving cell.
+        const bool even_nonzero_z = (cell->constr_z > 0) && ((cell->constr_z % 2) == 0);
+        const CellInfo *prev = (even_nonzero_z && carry_in != nullptr) ? carry_in->driver.cell : nullptr;
+        if (prev != nullptr) {
+            for (IdString pin : arith_pins) {
+                const NetInfo *own_net = get_net_or_empty(cell, pin);
+                if (own_net != nullptr && own_net == get_net_or_empty(prev, pin))
+                    ++info.chain_shared_input_count;
+            }
+        }
+    } else {
+        // LUT-like cells: the cell type fixes how many of the logical pins A..F are LUT inputs
+        const std::array<IdString, 6> lut_pins{id_A, id_B, id_C, id_D, id_E, id_F};
+        int input_count = 0;
+        info.lut_input_count = 0;
+        switch (cell->type.index) {
+        case ID_MISTRAL_ALUT6:
+            input_count = 6;
+            break;
+        case ID_MISTRAL_ALUT5:
+            input_count = 5;
+            break;
+        case ID_MISTRAL_ALUT4:
+            input_count = 4;
+            break;
+        case ID_MISTRAL_ALUT3:
+            input_count = 3;
+            break;
+        case ID_MISTRAL_ALUT2:
+            input_count = 2;
+            break;
+        case ID_MISTRAL_BUF: // used to route through to FFs etc
+        case ID_MISTRAL_NOT: // used for inverters that map to LUTs
+            input_count = 1;
+            break;
+        case ID_MISTRAL_CONST:
+            // MISTRAL_CONST is a nextpnr-inserted cell type for 0-input, constant-generating LUTs
+            break;
+        default:
+            log_error("unexpected combinational cell type %s\n", getCtx()->nameOf(cell->type));
+        }
+        info.lut_input_count = input_count;
+        for (int i = 0; i < input_count; i++)
+            info.lut_in[i] = get_net_or_empty(cell, lut_pins[i]);
+        // Note that this relationship won't hold for extended mode, when that is supported
+        info.lut_bits_count = (1 << input_count);
+    }
+
+    // Finally count the LUT inputs that actually have a net connected
+    info.used_lut_input_count = 0;
+    for (int i = 0; i < info.lut_input_count; i++) {
+        if (info.lut_in[i] != nullptr)
+            ++info.used_lut_input_count;
+    }
+}
+
+void Arch::assign_ff_info(CellInfo *cell) const
+{
+    // Fill in the ffInfo fields (control set, SDATA and DATAIN nets) of a flipflop cell from its ports
+    auto &ctrl = cell->ffInfo.ctrlset;
+    ctrl.clk = get_ctrlsig(getCtx(), cell, id_CLK);
+    ctrl.ena = get_ctrlsig(getCtx(), cell, id_ENA, /* explicit_const */ true);
+    ctrl.aclr = get_ctrlsig(getCtx(), cell, id_ACLR);
+    ctrl.sclr = get_ctrlsig(getCtx(), cell, id_SCLR);
+    ctrl.sload = get_ctrlsig(getCtx(), cell, id_SLOAD);
+    // If SCLR is used but SLOAD isn't, it seems we must pretend SLOAD is tied to GND ([BT]SLOAD_EN set, SLOAD_INV clear)
+    if ((ctrl.sclr.net != nullptr) && (ctrl.sload.net == nullptr)) {
+        ctrl.sload.net = nets.at(id("$PACKER_GND_NET")).get();
+        ctrl.sload.inverted = false;
+    }
+    cell->ffInfo.sdata = get_net_or_empty(cell, id_SDATA);
+    cell->ffInfo.datain = get_net_or_empty(cell, id_DATAIN);
+}
+
+// Validity checking functions
+bool Arch::is_alm_legal(uint32_t lab, uint8_t alm) const
+{
+    auto &alm_data = labs.at(lab).alms.at(alm);
+    // Checks whether the cells currently bound to this ALM's bels can share it. Get cells into arrays for fast access
+    std::array<const CellInfo *, 2> luts{getBoundBelCell(alm_data.lut_bels[0]), getBoundBelCell(alm_data.lut_bels[1])};
+    std::array<const CellInfo *, 4> ffs{getBoundBelCell(alm_data.ff_bels[0]), getBoundBelCell(alm_data.ff_bels[1]),
+                                        getBoundBelCell(alm_data.ff_bels[2]), getBoundBelCell(alm_data.ff_bels[3])};
+    int used_lut_bits = 0; // sum of lut_bits_count over the bound LUT cells
+
+    int total_lut_inputs = 0; // sum of lut_input_count (connected or not) over the bound LUT cells
+    // TODO: for more complex modes like extended/arithmetic, it might not always be possible for any LUT input to map
+    // to any of the ALM half inputs particularly shared and extended mode will need more thought and probably for this
+    // to be revisited
+    for (int i = 0; i < 2; i++) {
+        if (!luts[i])
+            continue;
+        total_lut_inputs += luts[i]->combInfo.lut_input_count;
+        used_lut_bits += luts[i]->combInfo.lut_bits_count;
+    }
+    // An ALM only has 64 bits of storage. In theory some of these cases might be legal because of overlap between the
+    // two functions, but the current placer is unlikely to stumble upon these cases frequently without anything to
+    // guide it, and the cost of checking them here almost certainly outweighs any marginal benefit in supporting them,
+    // at least for now.
+    if (used_lut_bits > 64)
+        return false;
+
+    if (total_lut_inputs > 8) {
+        NPNR_ASSERT(luts[0] && luts[1]); // something has gone badly wrong if this fails!
+        // Make sure that LUT inputs are not overprovisioned: inputs of LUT 1 that also appear on LUT 0 count once
+        int shared_lut_inputs = 0;
+        // Even though this N^2 search looks inefficient, it's unlikely a set lookup or similar is going to be much
+        // better given the low N.
+        for (int i = 0; i < luts[1]->combInfo.lut_input_count; i++) {
+            const NetInfo *sig = luts[1]->combInfo.lut_in[i];
+            for (int j = 0; j < luts[0]->combInfo.lut_input_count; j++) {
+                if (sig == luts[0]->combInfo.lut_in[j]) {
+                    ++shared_lut_inputs;
+                    break;
+                }
+            }
+        }
+        if ((total_lut_inputs - shared_lut_inputs) > 8)
+            return false;
+    }
+
+    bool carry_mode = false; // NOTE(review): never set to true anywhere in this function - confirm intent
+
+    // No mixing of carry and non-carry
+    if (luts[0] && luts[1] && luts[0]->combInfo.is_carry != luts[1]->combInfo.is_carry)
+        return false;
+
+    // For each ALM half; check FF control set sharing and input routeability
+    for (int i = 0; i < 2; i++) {
+        // There are two ways to route from the fabric into FF data - either routing through a LUT or using the E/F
+        // signals and SLOAD=1 (*PKREF*)
+        bool route_thru_lut_avail = !luts[i] && !carry_mode && (total_lut_inputs < 8) && (used_lut_bits < 64);
+        // E/F is treated as available if the LUT in the other half (1 - i, because the F input to EF_SEL is mirrored)
+        // is absent or has at most 2 connected inputs - conservative. NOTE(review): confirm 2 (not 3) is the intended limit
+        bool ef_available = (!luts[1 - i] || (luts[1 - i]->combInfo.used_lut_input_count <= 2));
+        // Control set checking
+        bool found_ff = false;
+
+        FFControlSet ctrlset;
+        for (int j = 0; j < 2; j++) {
+            const CellInfo *ff = ffs[i * 2 + j];
+            if (!ff)
+                continue;
+            if (j == 1) // the second FF of each half is rejected outright for now
+                return false; // TODO: why are these FFs broken?
+            if (found_ff) { // cannot be true while the j == 1 rejection above is in place
+                // Two FFs in the same half with an incompatible control set
+                if (ctrlset != ff->ffInfo.ctrlset)
+                    return false;
+            } else {
+                ctrlset = ff->ffInfo.ctrlset;
+            }
+            // SDATA must use the E/F input
+            // TODO: rare case of two FFs with the same SDATA in the same ALM half
+            if (ff->ffInfo.sdata) {
+                if (!ef_available)
+                    return false;
+                ef_available = false;
+            }
+            // Find a way of routing the input through fabric, if it's not driven by the LUT (route-through LUT first)
+            if (ff->ffInfo.datain && (!luts[i] || (ff->ffInfo.datain != luts[i]->combInfo.comb_out))) {
+                if (route_thru_lut_avail)
+                    route_thru_lut_avail = false;
+                else if (ef_available)
+                    ef_available = false;
+                else
+                    return false;
+            }
+            found_ff = true;
+        }
+    }
+
+    return true;
+}
+
+void Arch::update_alm_input_count(uint32_t lab, uint8_t alm)
+{
+    // Recompute how many routed inputs ALM `alm` of LAB `lab` is estimated to need for its currently bound cells,
+    // and store the result in the ALM's unique_input_count.
+    // TODO: duplication with is_alm_legal
+    auto &alm_data = labs.at(lab).alms.at(alm);
+    const std::array<const CellInfo *, 2> luts{getBoundBelCell(alm_data.lut_bels[0]),
+                                               getBoundBelCell(alm_data.lut_bels[1])};
+    // Connected LUT inputs, less those counted as shared with the previous cell of a carry chain
+    int lut_inputs = 0;
+    for (const CellInfo *lut : luts) {
+        if (lut != nullptr)
+            lut_inputs += lut->combInfo.used_lut_input_count - lut->combInfo.chain_shared_input_count;
+    }
+
+    // Nets on LUT 1 that also appear on LUT 0 are only counted once. Only 2 inputs have guaranteed sharing,
+    // without routeability based LUT permutation at least, so stop counting there.
+    int shared_inputs = 0;
+    if (luts[0] != nullptr && luts[1] != nullptr) {
+        const auto &lut0 = luts[0]->combInfo;
+        const auto &lut1 = luts[1]->combInfo;
+        for (int i = 0; i < lut1.lut_input_count && shared_inputs < 2; i++) {
+            const NetInfo *sig = lut1.lut_in[i];
+            bool on_lut0 = false;
+            for (int j = 0; sig != nullptr && !on_lut0 && j < lut0.lut_input_count; j++)
+                on_lut0 = (sig == lut0.lut_in[j]);
+            if (on_lut0)
+                ++shared_inputs;
+        }
+    }
+
+    int total_inputs = std::max(0, lut_inputs - shared_inputs);
+    for (int i = 0; i < 4; i++) {
+        const CellInfo *ff = getBoundBelCell(alm_data.ff_bels[i]);
+        if (ff == nullptr)
+            continue;
+        const CellInfo *own_lut = luts[i / 2];
+        if (ff->ffInfo.sdata != nullptr)
+            ++total_inputs;
+        // FF input doesn't consume routing resources if driven by associated LUT
+        const bool lut_driven = (own_lut != nullptr) && (ff->ffInfo.datain == own_lut->combInfo.comb_out);
+        if (ff->ffInfo.datain != nullptr && !lut_driven)
+            ++total_inputs;
+    }
+    alm_data.unique_input_count = total_inputs;
+}
+
+bool Arch::check_lab_input_count(uint32_t lab) const
+{
+    // There are only 46 TD signals available to route signals from general routing to the ALM inputs, and 4 of them
+    // are set aside for FF control inputs - so the summed unique_input_count of the LAB's ALMs may be at most 42.
+    // This is conservative: LD signals also provide feedback routing from ALM output to input, and TD signals may be
+    // shared if the same net routes to multiple ALMs. Using that needs careful handling and LUT permutation during
+    // routing; and in any event conservative LAB packing helps nextpnr's place and route achieve good runtimes.
+    constexpr int max_alm_inputs = 46 - 4;
+
+    const auto &alms = labs.at(lab).alms;
+    int total = 0;
+    for (int alm = 0; alm < 10; alm++)
+        total += alms.at(alm).unique_input_count;
+    return total <= max_alm_inputs;
+}
+
+namespace {
+bool check_assign_sig(ControlSig &sig_set, const ControlSig &sig)
+{
+    // Try to fit `sig` into the single slot `sig_set`; returns whether it fits.
+    // A signal without a net always fits, as does one equal to what the slot already holds.
+    if (sig.net == nullptr || sig_set == sig)
+        return true;
+    // The slot is already held by a different signal
+    if (sig_set.net != nullptr)
+        return false;
+    // The slot is free: claim it
+    sig_set = sig;
+    return true;
+}
+
+template <size_t N> bool check_assign_sig(std::array<ControlSig, N> &sig_set, const ControlSig &sig)
+{
+    // Array form: a signal without a net always fits; otherwise the slots are scanned in order and the scan
+    // stops at the first slot that either already holds `sig` or is free (in which case it is claimed)
+    bool fits = (sig.net == nullptr);
+    for (size_t i = 0; i < N && !fits; i++) {
+        ControlSig &slot = sig_set[i];
+        fits = (slot == sig) || (slot.net == nullptr);
+        if (fits && !(slot == sig))
+            slot = sig;
+    }
+    return fits;
+}
+
+// DATAIN mapping rules - which LAB DATAIN signals can be used for ENA and ACLR, listed in the order they are tried
+static constexpr std::array<int, 3> ena_datain{2, 3, 0}; // same DATAIN indices as lab.ena_wires[0..2] in create_lab
+static constexpr std::array<int, 2> aclr_datain{3, 2}; // same DATAIN indices as lab.aclr_wires[0..1] in create_lab
+
+struct LabCtrlSetWorker
+{
+
+    ControlSig clk{}, sload{}, sclr{}; // the single LAB-wide CLK, SLOAD and SCLR signals found so far
+    std::array<ControlSig, 2> aclr{}; // up to 2 distinct ACLR signals
+    std::array<ControlSig, 3> ena{}; // up to 3 distinct ENA signals
+
+    std::array<ControlSig, 4> datain{}; // the signal assigned to each of the 4 shared LAB DATAIN lines
+
+    bool run(const Arch *arch, uint32_t lab) // returns false if the control signals of the FFs bound in `lab` don't fit
+    {
+        // Strictly speaking the constraint is up to 2 unique CLK and 3 CLK+ENA pairs. For now we simplify this to 1 CLK
+        // and 3 ENA though.
+        for (uint8_t alm = 0; alm < 10; alm++) {
+            for (uint8_t i = 0; i < 4; i++) {
+                const CellInfo *ff = arch->getBoundBelCell(arch->labs.at(lab).alms.at(alm).ff_bels.at(i));
+                if (ff == nullptr)
+                    continue;
+
+                if (!check_assign_sig(clk, ff->ffInfo.ctrlset.clk))
+                    return false;
+                if (!check_assign_sig(sload, ff->ffInfo.ctrlset.sload))
+                    return false;
+                if (!check_assign_sig(sclr, ff->ffInfo.ctrlset.sclr))
+                    return false;
+                if (!check_assign_sig(aclr, ff->ffInfo.ctrlset.aclr))
+                    return false;
+                if (!check_assign_sig(ena, ff->ffInfo.ctrlset.ena))
+                    return false;
+            }
+        }
+        // Check for overuse of the shared, LAB-wide datain signals
+        if (clk.net != nullptr && !clk.net->is_global)
+            if (!check_assign_sig(datain[0], clk)) // CLK only needs DATAIN[0] if it's not global
+                return false;
+        if (!check_assign_sig(datain[1], sload))
+            return false;
+        if (!check_assign_sig(datain[3], sclr))
+            return false;
+        for (const auto &aclr_sig : aclr) {
+            // Check both possibilities that ACLR can map to
+            // TODO: ACLR could be global, too
+            if (check_assign_sig(datain[aclr_datain[0]], aclr_sig))
+                continue;
+            if (check_assign_sig(datain[aclr_datain[1]], aclr_sig))
+                continue;
+            // Failed to find any free ACLR-capable DATAIN
+            return false;
+        }
+        for (const auto &ena_sig : ena) {
+            // Check all 3 possibilities that ENA can map to
+            // NOTE(review): this spot carried a "TODO: ACLR could be global, too" copied from the ACLR loop - confirm for ENA
+            if (check_assign_sig(datain[ena_datain[0]], ena_sig))
+                continue;
+            if (check_assign_sig(datain[ena_datain[1]], ena_sig))
+                continue;
+            if (check_assign_sig(datain[ena_datain[2]], ena_sig))
+                continue;
+            // Failed to find any free ENA-capable DATAIN
+            return false;
+        }
+        return true;
+    }
+};
+
+}; // namespace
+
+bool Arch::is_lab_ctrlset_legal(uint32_t lab) const
+{
+    // Legal iff a fresh LabCtrlSetWorker can fit the control signals of every FF bound in the LAB
+    return LabCtrlSetWorker{}.run(this, lab);
+}
+
+void Arch::lab_pre_route()
+{
+    // For each LAB: reserve the control set routing first, then set up the input pin mapping of each of its 10 ALMs
+    log_info("Preparing LABs for routing...\n");
+    for (uint32_t lab_idx = 0; lab_idx != labs.size(); ++lab_idx) {
+        assign_control_sets(lab_idx);
+        for (uint8_t alm_idx = 0; alm_idx != 10; ++alm_idx)
+            reassign_alm_inputs(lab_idx, alm_idx);
+    }
+}
+
+void Arch::assign_control_sets(uint32_t lab)
+{
+    // Set up reservations for checkPipAvail for the control set signals of every FF bound in LAB `lab`.
+    // This is needed because clock and CE are routed together and must be kept together, there isn't free choice
+    // e.g. CLK0 & ENA0 must be used for one control set, and CLK1 & ENA1 for another, they can't be mixed and matched
+    // Similarly for how inverted & noninverted variants must be kept separate
+    LabCtrlSetWorker worker;
+    bool legal = worker.run(this, lab); // also fills worker.datain, the signal chosen for each LAB DATAIN line
+    NPNR_ASSERT(legal);
+    auto &lab_data = labs.at(lab);
+
+    for (int j = 0; j < 2; j++) {
+        lab_data.aclr_used[j] = false;
+    }
+
+    for (uint8_t alm = 0; alm < 10; alm++) {
+        auto &alm_data = lab_data.alms.at(alm);
+        for (uint8_t i = 0; i < 4; i++) {
+            BelId ff_bel = alm_data.ff_bels.at(i);
+            const CellInfo *ff = getBoundBelCell(ff_bel);
+            if (ff == nullptr)
+                continue;
+            // Pick the first ENA-capable DATAIN line that carries this FF's enable; j is the chosen CLK/ENA set
+            ControlSig ena_sig = ff->ffInfo.ctrlset.ena;
+            WireId clk_wire = getBelPinWire(ff_bel, id_CLK);
+            WireId ena_wire = getBelPinWire(ff_bel, id_ENA);
+            for (int j = 0; j < 3; j++) {
+                if (ena_sig == worker.datain[ena_datain[j]]) {
+                    if (getCtx()->debug) // report the set index j (not the FF index i)
+                        log_info("Assigned CLK/ENA set %d to FF %s (%s)\n", j, nameOf(ff), getCtx()->nameOfBel(ff_bel));
+                    // TODO: lock clock according to ENA choice, too, when we support two clocks per ALM
+                    reserve_route(lab_data.clk_wires[0], clk_wire);
+                    reserve_route(lab_data.ena_wires[j], ena_wire);
+                    alm_data.clk_ena_idx[i / 2] = j;
+                    break;
+                }
+            }
+
+            // Likewise for ACLR: j is the index of the LAB ACLR wire whose DATAIN line carries this FF's ACLR
+            ControlSig aclr_sig = ff->ffInfo.ctrlset.aclr;
+            WireId aclr_wire = getBelPinWire(ff_bel, id_ACLR);
+            for (int j = 0; j < 2; j++) {
+                // TODO: could be global ACLR, too
+                if (aclr_sig == worker.datain[aclr_datain[j]]) {
+                    if (getCtx()->debug) // report the set index j (not the FF index i)
+                        log_info("Assigned ACLR set %d to FF %s (%s)\n", j, nameOf(ff), getCtx()->nameOfBel(ff_bel));
+                    reserve_route(lab_data.aclr_wires[j], aclr_wire);
+                    lab_data.aclr_used[j] = (aclr_sig.net != nullptr);
+                    alm_data.aclr_idx[i / 2] = j;
+                    break;
+                }
+            }
+        }
+    }
+}
+
+namespace {
+// Gets the name of logical LUT pin i for a given cell
+static IdString get_lut_pin(CellInfo *cell, int i)
+{
+    if (cell->type == id_MISTRAL_ALUT_ARITH) // arithmetic cells have five logical pins, ending in D0/D1
+        return std::array<IdString, 5>{id_A, id_B, id_C, id_D0, id_D1}.at(i);
+    return std::array<IdString, 6>{id_A, id_B, id_C, id_D, id_E, id_F}.at(i);
+}
+
+static void assign_lut6_inputs(CellInfo *cell, int lut)
+{
+    // Pack the connected logical pins, in pin order, onto physical pins A-D then E/F (E1/F1 if lut == 1, else E0/F0)
+    const std::array<IdString, 6> phys{id_A, id_B, id_C, id_D, lut == 1 ? id_E1 : id_E0, lut == 1 ? id_F1 : id_F0};
+    for (int pin = 0, next_phys = 0; pin < 6; pin++) {
+        const IdString logical = get_lut_pin(cell, pin);
+        if (cell->ports.count(logical) == 0 || cell->ports.at(logical).net == nullptr)
+            continue; // missing or unconnected pins don't take up a physical pin
+        cell->pin_data[logical].bel_pins.clear();
+        cell->pin_data[logical].bel_pins.push_back(phys.at(next_phys++));
+    }
+}
+} // namespace
+
+void Arch::reassign_alm_inputs(uint32_t lab, uint8_t alm)
+{
+    // Based on the usage of LUTs inside ALM `alm` of LAB `lab`, set up the cell-bel pin map for the combinational
+    // cells in the ALM so that each physical bel pin is only used for one net and the logical functions can be
+    // implemented correctly. Also inserts route-through LUTs to legalise flipflop inputs where needed (see below).
+    auto &alm_data = labs.at(lab).alms.at(alm);
+    alm_data.l6_mode = false;
+    std::array<CellInfo *, 2> luts{getBoundBelCell(alm_data.lut_bels[0]), getBoundBelCell(alm_data.lut_bels[1])};
+    std::array<CellInfo *, 4> ffs{getBoundBelCell(alm_data.ff_bels[0]), getBoundBelCell(alm_data.ff_bels[1]),
+                                  getBoundBelCell(alm_data.ff_bels[2]), getBoundBelCell(alm_data.ff_bels[3])};
+
+    for (int i = 0; i < 2; i++) {
+        // Currently we treat LUT6s as a special case, as they never share inputs
+        if (luts[i] != nullptr && luts[i]->type == id_MISTRAL_ALUT6) {
+            alm_data.l6_mode = true;
+            NPNR_ASSERT(luts[1 - i] == nullptr); // only allow one LUT6 per ALM and no other LUTs
+            assign_lut6_inputs(luts[i], i);
+        }
+    }
+
+    if (!alm_data.l6_mode) {
+        // In L5 mode; which is what we use in this case
+        //  - A and B are shared
+        //  - C, E0, and F0 are exclusive to the top LUT5 section
+        //  - D, E1, and F1 are exclusive to the bottom LUT5 section
+        // First find up to two nets that feed both LUTs; they are numbered 0 and 1 in the order they are found
+        std::unordered_map<IdString, int> shared_nets;
+        if (luts[0] && luts[1]) {
+            for (int i = 0; i < luts[0]->combInfo.lut_input_count; i++) {
+                for (int j = 0; j < luts[1]->combInfo.lut_input_count; j++) {
+                    if (luts[0]->combInfo.lut_in[i] == nullptr)
+                        continue;
+                    if (luts[0]->combInfo.lut_in[i] != luts[1]->combInfo.lut_in[j])
+                        continue;
+                    IdString net = luts[0]->combInfo.lut_in[i]->name;
+                    if (shared_nets.count(net))
+                        continue;
+                    int idx = int(shared_nets.size());
+                    shared_nets[net] = idx;
+                    if (shared_nets.size() >= 2)
+                        goto shared_search_done;
+                }
+            }
+        shared_search_done:;
+        }
+        // A and B can be used for half-specific nets if not assigned to shared nets
+        bool a_avail = shared_nets.size() == 0, b_avail = shared_nets.size() <= 1;
+        // Do the actual port assignment
+        for (int i = 0; i < 2; i++) {
+            if (!luts[i])
+                continue;
+            // Work out which physical ports are available
+            std::vector<IdString> avail_phys_ports;
+            // D/C always available and dedicated to the half, in L5 mode
+            avail_phys_ports.push_back((i == 1) ? id_D : id_C);
+            // In arithmetic mode, Ei can only be used for D0 and Fi can only be used for D1
+            // otherwise, these are general and dedicated to one half
+            if (!luts[i]->combInfo.is_carry) {
+                avail_phys_ports.push_back((i == 1) ? id_E1 : id_E0);
+                avail_phys_ports.push_back((i == 1) ? id_F1 : id_F0);
+            }
+            // A and B might be used for shared signals, or already used by the other half
+            if (b_avail)
+                avail_phys_ports.push_back(id_B);
+            if (a_avail)
+                avail_phys_ports.push_back(id_A);
+            int phys_idx = 0;
+
+            for (int j = 0; j < luts[i]->combInfo.lut_input_count; j++) {
+                IdString log = get_lut_pin(luts[i], j);
+                auto &bel_pins = luts[i]->pin_data[log].bel_pins;
+                bel_pins.clear();
+
+                NetInfo *net = get_net_or_empty(luts[i], log);
+                if (net == nullptr) {
+                    // Disconnected inputs don't need to be allocated a pin, because the router won't be routing these
+                    continue;
+                } else if (shared_nets.count(net->name)) {
+                    // This pin carries a shared net: shared net 0 is placed on A, shared net 1 on B
+                    bel_pins.push_back(shared_nets.at(net->name) ? id_B : id_A);
+                } else if (log == id_D0) {
+                    // Arithmetic: D0 and D1 always take this half's E and F pins respectively
+                    bel_pins.push_back((i == 1) ? id_E1 : id_E0); // reserved
+                } else if (log == id_D1) {
+                    bel_pins.push_back((i == 1) ? id_F1 : id_F0); // reserved
+                } else {
+                    // Allocate from the general pool of available physical pins
+                    IdString phys = avail_phys_ports.at(phys_idx++);
+                    bel_pins.push_back(phys);
+                    // Mark A/B unavailable for the other LUT, if needed
+                    if (phys == id_A)
+                        a_avail = false;
+                    else if (phys == id_B)
+                        b_avail = false;
+                }
+            }
+        }
+    }
+
+    // FF route-through insertion
+    for (int i = 0; i < 2; i++) {
+        // FF route-through will never be inserted if LUT is used
+        if (luts[i])
+            continue;
+        for (int j = 0; j < 2; j++) {
+            CellInfo *ff = ffs[i * 2 + j];
+            if (!ff || !ff->ffInfo.datain || alm_data.l6_mode)
+                continue;
+            CellInfo *rt_lut = createCell(id(stringf("%s$ROUTETHRU", nameOf(ff))), id_MISTRAL_BUF);
+            rt_lut->addInput(id_A);
+            rt_lut->addOutput(id_Q);
+            // Disconnect the original data input to the FF, and connect it to the route-thru LUT instead
+            NetInfo *datain = get_net_or_empty(ff, id_DATAIN);
+            disconnect_port(getCtx(), ff, id_DATAIN);
+            connect_port(getCtx(), datain, rt_lut, id_A);
+            connect_ports(getCtx(), rt_lut, id_Q, ff, id_DATAIN);
+            // Assign route-thru LUT physical ports, input goes to the first half-specific input
+            rt_lut->pin_data[id_A].bel_pins.push_back(i ? id_D : id_C);
+            rt_lut->pin_data[id_Q].bel_pins.push_back(id_COMBOUT);
+            assign_comb_info(rt_lut);
+            // Place the route-thru LUT at the relevant combinational bel
+            bindBel(alm_data.lut_bels[i], rt_lut, STRENGTH_STRONG);
+            break; // this half's LUT bel is now occupied, so stop after the first route-through
+        }
+    }
+
+    // TODO: in the future, as well as the reassignment here we will also have pseudo PIPs in front of the ALM so
+    // that the router can permute LUTs for routeability, too.
+    // Here we will need to lock out some of those PIPs depending on the usage of the ALM, as not all inputs are
+    // always interchangeable.
+}
+
+// Default cell-bel pin mapping (logical cell pin -> physical bel pin), used to provide estimates during placement
+// only. It will have errors and overlaps; a correct mapping is resolved between placement and routing.
+const std::unordered_map<IdString, IdString> Arch::comb_pinmap = {
+        {id_A, id_F0}, // fastest input first
+        {id_B, id_E0}, {id_C, id_D}, {id_D, id_C},       {id_D0, id_C},       {id_D1, id_B},
+        {id_E, id_B},  {id_F, id_A}, {id_Q, id_COMBOUT}, {id_SO, id_COMBOUT},
+};
+
+namespace {
+// Returns the LUT init (truth table) value to use for a combinational cell.
+// NOT and BUF cells have fixed tables (1 and 2). Arithmetic ALUTs read parameter LUT1 when i == 1 and LUT0
+// otherwise; every other cell type reads LUT and ignores i. A missing parameter yields 0.
+uint64_t get_lut_init(const CellInfo *cell, int i)
+{
+    if (cell->type == id_MISTRAL_NOT)
+        return 1;
+    if (cell->type == id_MISTRAL_BUF)
+        return 2;
+
+    // Select which parameter holds the truth table
+    IdString param_name = id_LUT;
+    if (cell->type == id_MISTRAL_ALUT_ARITH)
+        param_name = (i == 1) ? id_LUT1 : id_LUT0;
+
+    const auto entry = cell->params.find(param_name);
+    if (entry != cell->params.end())
+        return entry->second.as_int64();
+    return 0;
+}
+// Gets the value a physical ALM input pin takes when evaluating bit number `bit` of the LUT mask
+bool get_phys_pin_val(bool l6_mode, bool arith_mode, int bit, IdString pin)
+{
+    switch (pin.index) {
+    case ID_A:
+        return (bit & 0x01) != 0;
+    case ID_B:
+        return (bit & 0x02) != 0;
+    case ID_C:
+        return (bit & ((l6_mode && bit >= 32) ? 0x08 : 0x04)) != 0;
+    case ID_D:
+        return (bit & ((l6_mode && bit < 32) ? 0x08 : 0x04)) != 0;
+    case ID_E0:
+    case ID_E1:
+        return (bit & (l6_mode ? 0x20 : 0x08)) != 0;
+    case ID_F0:
+    case ID_F1:
+        return (bit & (arith_mode ? 0x08 : 0x10)) != 0;
+    default:
+        NPNR_ASSERT_FALSE("unknown physical pin!");
+    }
+}
+} // namespace
+
+// Computes the 64-bit LUT mask of ALM `alm` in LAB `lab` from the cells bound to its two LUT bels, using the
+// cell-bel pin map to depermute the inputs. The returned mask is inverted; it is also logged when debug is set.
+uint64_t Arch::compute_lut_mask(uint32_t lab, uint8_t alm)
+{
+    uint64_t mask = 0;
+    auto &alm_data = labs.at(lab).alms.at(alm);
+    std::array<CellInfo *, 2> luts{getBoundBelCell(alm_data.lut_bels[0]), getBoundBelCell(alm_data.lut_bels[1])};
+
+    for (int i = 0; i < 2; i++) {
+        CellInfo *lut = luts[i];
+        if (!lut)
+            continue;
+        int offset = ((i == 1) && !alm_data.l6_mode) ? 32 : 0; // outside L6 mode, LUT 1 fills mask bits 32..63
+        bool arith = lut->combInfo.is_carry;
+        for (int j = 0; j < (alm_data.l6_mode ? 64 : 32); j++) {
+            // Evaluate the cell's logical function for mask bit j by building the matching index into its init
+            uint64_t init = get_lut_init(lut, (arith && j >= 16) ? 1 : 0);
+            int index = 0;
+            for (int k = 0; k < lut->combInfo.lut_input_count; k++) {
+                IdString log_pin = get_lut_pin(lut, k);
+                int init_idx = k;
+                if (arith) {
+                    // D0 only affects lower half; D1 upper half, where it stands in as init input 3
+                    if (k == 3 && j >= 16)
+                        continue;
+                    if (k == 4) {
+                        if (j < 16)
+                            continue;
+                        init_idx = 3;
+                    }
+                }
+                CellPinState state = lut->get_pin_state(log_pin);
+                if (state == PIN_0)
+                    continue;
+                if (state == PIN_1)
+                    index |= (1 << init_idx);
+                // Ignore if no associated physical pin
+                if (get_net_or_empty(lut, log_pin) == nullptr || lut->pin_data.at(log_pin).bel_pins.empty())
+                    continue;
+                // ALM inputs appear to be inverted by default (TODO: check!)
+                // so only invert if an inverter has _not_ been folded into the pin
+                bool inverted = (state != PIN_INV);
+                // Depermute: look up the value the assigned physical pin has at mask bit j
+                IdString phys_pin = lut->pin_data.at(log_pin).bel_pins.at(0);
+                if (get_phys_pin_val(alm_data.l6_mode, arith, j, phys_pin) != inverted)
+                    index |= (1 << init_idx);
+            }
+            if ((init >> index) & 0x1)
+                mask |= (1ULL << uint64_t(j + offset));
+        }
+    }
+
+    // TODO: always inverted, or just certain paths?
+    mask = ~mask;
+
+#if 1
+    if (getCtx()->debug) {
+        auto pos = alm_data.lut_bels[0].pos;
+        log("ALM %03d.%03d.%d\n", CycloneV::pos2x(pos), CycloneV::pos2y(pos), alm);
+        for (int i = 0; i < 2; i++) {
+            log("    LUT%d: ", i);
+            if (luts[i]) {
+                log("%s:%s", nameOf(luts[i]), nameOf(luts[i]->type));
+                for (auto &pin : luts[i]->pin_data) {
+                    if (!luts[i]->ports.count(pin.first) || luts[i]->ports.at(pin.first).type != PORT_IN)
+                        continue;
+                    log(" %s:", nameOf(pin.first));
+                    if (pin.second.state == PIN_0)
+                        log("0");
+                    else if (pin.second.state == PIN_1)
+                        log("1");
+                    else if (pin.second.state == PIN_INV)
+                        log("~");
+                    for (auto bp : pin.second.bel_pins)
+                        log("%s", nameOf(bp));
+                }
+            } else {
+                log("<null>");
+            }
+            log("\n");
+        }
+        log("INIT: %016llx\n", static_cast<unsigned long long>(mask)); // %llx + cast: portable for uint64_t
+        log("\n");
+    }
+#endif
+
+    return mask;
+}
+
+NEXTPNR_NAMESPACE_END