author    gatecat <gatecat@ds0.me>    2022-04-08 13:42:54 +0100
committer gatecat <gatecat@ds0.me>    2022-04-08 13:42:54 +0100
commit    49f178ed94b5fad00d25dbd12adea0bf4732f803 (patch)
tree      ea642e20bc07441a800944390e1f904e6ce5b113 /common/place
parent    e42e22575f20b59634f88b5cf694efdb413ff0a0 (diff)
Split up common into kernel,place,route
Signed-off-by: gatecat <gatecat@ds0.me>
Diffstat (limited to 'common/place')
-rw-r--r--  common/place/fast_bels.h          188
-rw-r--r--  common/place/parallel_refine.cc   959
-rw-r--r--  common/place/parallel_refine.h     43
-rw-r--r--  common/place/place_common.cc      487
-rw-r--r--  common/place/place_common.h        55
-rw-r--r--  common/place/placer1.cc          1317
-rw-r--r--  common/place/placer1.h             45
-rw-r--r--  common/place/placer_heap.cc      1830
-rw-r--r--  common/place/placer_heap.h         58
-rw-r--r--  common/place/timing_opt.cc        561
-rw-r--r--  common/place/timing_opt.h          37
11 files changed, 5580 insertions, 0 deletions
diff --git a/common/place/fast_bels.h b/common/place/fast_bels.h
new file mode 100644
index 00000000..ba9938c6
--- /dev/null
+++ b/common/place/fast_bels.h
@@ -0,0 +1,188 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2018 Claire Xenia Wolf <claire@yosyshq.com>
+ * Copyright (C) 2018 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#pragma once
+
+#include <cstddef>
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+// FastBels is a helper class that provides a fast lookup for finding BELs
+// that support a given cell type.
+struct FastBels
+{
+ struct TypeData
+ {
+ size_t type_index;
+ int number_of_possible_bels;
+ };
+
+ FastBels(Context *ctx, bool check_bel_available, int minBelsForGridPick)
+ : ctx(ctx), check_bel_available(check_bel_available), minBelsForGridPick(minBelsForGridPick)
+ {
+ }
+
+ void addCellType(IdString cell_type)
+ {
+ auto iter = cell_types.find(cell_type);
+ if (iter != cell_types.end()) {
+ // This cell type has already been added to the fast BEL lookup.
+ return;
+ }
+
+ size_t type_idx = cell_types.size();
+ auto &cell_type_data = cell_types[cell_type];
+ cell_type_data.type_index = type_idx;
+
+ fast_bels_by_cell_type.resize(type_idx + 1);
+ auto &bel_data = fast_bels_by_cell_type.at(type_idx);
+ NPNR_ASSERT(bel_data.get() == nullptr);
+ bel_data = std::make_unique<FastBelsData>();
+
+ for (auto bel : ctx->getBels()) {
+ if (!ctx->isValidBelForCellType(cell_type, bel)) {
+ continue;
+ }
+
+ cell_type_data.number_of_possible_bels += 1;
+ }
+
+ for (auto bel : ctx->getBels()) {
+ if (check_bel_available && !ctx->checkBelAvail(bel)) {
+ continue;
+ }
+
+ if (!ctx->isValidBelForCellType(cell_type, bel)) {
+ continue;
+ }
+
+ Loc loc = ctx->getBelLocation(bel);
+ if (minBelsForGridPick >= 0 && cell_type_data.number_of_possible_bels < minBelsForGridPick) {
+ loc.x = loc.y = 0;
+ }
+
+ if (int(bel_data->size()) < (loc.x + 1)) {
+ bel_data->resize(loc.x + 1);
+ }
+
+ if (int(bel_data->at(loc.x).size()) < (loc.y + 1)) {
+ bel_data->at(loc.x).resize(loc.y + 1);
+ }
+
+ bel_data->at(loc.x).at(loc.y).push_back(bel);
+ }
+ }
+
+ void addBelBucket(BelBucketId partition)
+ {
+ auto iter = partition_types.find(partition);
+ if (iter != partition_types.end()) {
+ // This partition has already been added to the fast BEL lookup.
+ return;
+ }
+
+ size_t type_idx = partition_types.size();
+ auto &type_data = partition_types[partition];
+ type_data.type_index = type_idx;
+
+ fast_bels_by_partition_type.resize(type_idx + 1);
+ auto &bel_data = fast_bels_by_partition_type.at(type_idx);
+ NPNR_ASSERT(bel_data.get() == nullptr);
+ bel_data = std::make_unique<FastBelsData>();
+
+ for (auto bel : ctx->getBels()) {
+ if (ctx->getBelBucketForBel(bel) != partition) {
+ continue;
+ }
+
+ type_data.number_of_possible_bels += 1;
+ }
+
+ for (auto bel : ctx->getBels()) {
+ if (check_bel_available && !ctx->checkBelAvail(bel)) {
+ continue;
+ }
+
+ if (ctx->getBelBucketForBel(bel) != partition) {
+ continue;
+ }
+
+ Loc loc = ctx->getBelLocation(bel);
+ if (minBelsForGridPick >= 0 && type_data.number_of_possible_bels < minBelsForGridPick) {
+ loc.x = loc.y = 0;
+ }
+
+ if (int(bel_data->size()) < (loc.x + 1)) {
+ bel_data->resize(loc.x + 1);
+ }
+
+ if (int(bel_data->at(loc.x).size()) < (loc.y + 1)) {
+ bel_data->at(loc.x).resize(loc.y + 1);
+ }
+
+ bel_data->at(loc.x).at(loc.y).push_back(bel);
+ }
+ }
+
+ typedef std::vector<std::vector<std::vector<BelId>>> FastBelsData;
+
+ int getBelsForCellType(IdString cell_type, FastBelsData **data)
+ {
+ auto iter = cell_types.find(cell_type);
+ if (iter == cell_types.end()) {
+ addCellType(cell_type);
+ iter = cell_types.find(cell_type);
+ NPNR_ASSERT(iter != cell_types.end());
+ }
+
+ auto cell_type_data = iter->second;
+
+ *data = fast_bels_by_cell_type.at(cell_type_data.type_index).get();
+ return cell_type_data.number_of_possible_bels;
+ }
+
+ size_t getBelsForBelBucket(BelBucketId partition, FastBelsData **data)
+ {
+ auto iter = partition_types.find(partition);
+ if (iter == partition_types.end()) {
+ addBelBucket(partition);
+ iter = partition_types.find(partition);
+ NPNR_ASSERT(iter != partition_types.end());
+ }
+
+ auto type_data = iter->second;
+
+ *data = fast_bels_by_partition_type.at(type_data.type_index).get();
+ return type_data.number_of_possible_bels;
+ }
+
+ Context *ctx;
+ const bool check_bel_available;
+ const int minBelsForGridPick;
+
+ dict<IdString, TypeData> cell_types;
+ std::vector<std::unique_ptr<FastBelsData>> fast_bels_by_cell_type;
+
+ dict<BelBucketId, TypeData> partition_types;
+ std::vector<std::unique_ptr<FastBelsData>> fast_bels_by_partition_type;
+};
+
+NEXTPNR_NAMESPACE_END
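
A minimal usage sketch of FastBels (illustrative only; assumes a valid Context *ctx, and the cell-type name and the x/y coordinates are placeholders): candidate BELs are registered per cell type, then looked up by grid location.

    FastBels fast_bels(ctx, /*check_bel_available=*/false, /*minBelsForGridPick=*/64);
    IdString cell_type = ctx->id("EXAMPLE_CELL_TYPE"); // hypothetical cell type name
    fast_bels.addCellType(cell_type);

    FastBels::FastBelsData *by_loc = nullptr;
    int candidate_count = fast_bels.getBelsForCellType(cell_type, &by_loc);
    int x = 0, y = 0; // placeholder grid location
    if (candidate_count > 0 && x < int(by_loc->size()) && y < int(by_loc->at(x).size())) {
        for (BelId bel : by_loc->at(x).at(y)) {
            // every 'bel' here can implement 'cell_type' and sits at grid location (x, y)
            log_info("candidate: %s\n", ctx->nameOfBel(bel));
        }
    }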
diff --git a/common/place/parallel_refine.cc b/common/place/parallel_refine.cc
new file mode 100644
index 00000000..a868ca58
--- /dev/null
+++ b/common/place/parallel_refine.cc
@@ -0,0 +1,959 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2021-22 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "parallel_refine.h"
+#include "log.h"
+
+#if !defined(__wasm)
+
+#include "fast_bels.h"
+#include "scope_lock.h"
+#include "timing.h"
+
+#include <chrono>
+#include <mutex>
+#include <queue>
+#include <shared_mutex>
+#include <thread>
+
+NEXTPNR_NAMESPACE_BEGIN
+
+namespace {
+struct Partition
+{
+ int x0, y0, x1, y1;
+ std::vector<CellInfo *> cells;
+ Partition() = default;
+ explicit Partition(Context *ctx)
+ {
+ x0 = ctx->getGridDimX();
+ y0 = ctx->getGridDimY();
+ x1 = 0;
+ y1 = 0;
+ for (auto &cell : ctx->cells) {
+ Loc l = ctx->getBelLocation(cell.second->bel);
+ x0 = std::min(x0, l.x);
+ x1 = std::max(x1, l.x);
+ y0 = std::min(y0, l.y);
+ y1 = std::max(y1, l.y);
+ cells.push_back(cell.second.get());
+ }
+ }
+ void split(Context *ctx, bool yaxis, float pivot, Partition &l, Partition &r)
+ {
+ std::sort(cells.begin(), cells.end(), [&](CellInfo *a, CellInfo *b) {
+ Loc l0 = ctx->getBelLocation(a->bel), l1 = ctx->getBelLocation(b->bel);
+ return yaxis ? (l0.y < l1.y) : (l0.x < l1.x);
+ });
+ size_t pivot_point = size_t(cells.size() * pivot);
+ l.cells.clear();
+ r.cells.clear();
+ l.cells.reserve(pivot_point);
+ r.cells.reserve(cells.size() - pivot_point);
+ int pivot_coord = (pivot_point == 0) ? (yaxis ? y1 : x1)
+ : (yaxis ? ctx->getBelLocation(cells.at(pivot_point - 1)->bel).y
+ : ctx->getBelLocation(cells.at(pivot_point - 1)->bel).x);
+ for (size_t i = 0; i < cells.size(); i++) {
+ Loc loc = ctx->getBelLocation(cells.at(i)->bel);
+ ((yaxis ? loc.y : loc.x) <= pivot_coord ? l.cells : r.cells).push_back(cells.at(i));
+ }
+ if (yaxis) {
+ l.x0 = r.x0 = x0;
+ l.x1 = r.x1 = x1;
+ l.y0 = y0;
+ l.y1 = pivot_coord;
+ r.y0 = (pivot_coord == y1) ? y1 : (pivot_coord + 1);
+ r.y1 = y1;
+ } else {
+ l.y0 = r.y0 = y0;
+ l.y1 = r.y1 = y1;
+ l.x0 = x0;
+ l.x1 = pivot_coord;
+ r.x0 = (pivot_coord == x1) ? x1 : (pivot_coord + 1);
+ r.x1 = x1;
+ }
+ }
+};
+
+typedef int64_t wirelen_t;
+
+struct NetBB
+{
+ // Actual bounding box
+ int x0 = 0, x1 = 0, y0 = 0, y1 = 0;
+ // Number of cells at each extremity
+ int nx0 = 0, nx1 = 0, ny0 = 0, ny1 = 0;
+ wirelen_t hpwl(const ParallelRefineCfg &cfg) const
+ {
+ return wirelen_t(cfg.hpwl_scale_x * (x1 - x0) + cfg.hpwl_scale_y * (y1 - y0));
+ }
+ static NetBB compute(const Context *ctx, const NetInfo *net, const dict<IdString, BelId> *cell2bel = nullptr)
+ {
+ NetBB result{};
+ if (!net->driver.cell)
+ return result;
+ auto bel_loc = [&](const CellInfo *cell) {
+ BelId bel = cell2bel ? cell2bel->at(cell->name) : cell->bel;
+ return ctx->getBelLocation(bel);
+ };
+ result.nx0 = result.nx1 = result.ny0 = result.ny1 = 1;
+ Loc drv_loc = bel_loc(net->driver.cell);
+ result.x0 = result.x1 = drv_loc.x;
+ result.y0 = result.y1 = drv_loc.y;
+ for (auto &usr : net->users) {
+ Loc l = bel_loc(usr.cell);
+ if (l.x == result.x0)
+ ++result.nx0; // on the edge
+ else if (l.x < result.x0) {
+ result.x0 = l.x; // extends the edge
+ result.nx0 = 1;
+ }
+ if (l.x == result.x1)
+ ++result.nx1; // on the edge
+ else if (l.x > result.x1) {
+ result.x1 = l.x; // extends the edge
+ result.nx1 = 1;
+ }
+ if (l.y == result.y0)
+ ++result.ny0; // on the edge
+ else if (l.y < result.y0) {
+ result.y0 = l.y; // extends the edge
+ result.ny0 = 1;
+ }
+ if (l.y == result.y1)
+ ++result.ny1; // on the edge
+ else if (l.y > result.y1) {
+ result.y1 = l.y; // extends the edge
+ result.ny1 = 1;
+ }
+ }
+ return result;
+ }
+};
+
+struct GlobalState
+{
+ explicit GlobalState(Context *ctx, ParallelRefineCfg cfg) : ctx(ctx), cfg(cfg), bels(ctx, false, 64), tmg(ctx){};
+ Context *ctx;
+ ParallelRefineCfg cfg;
+ FastBels bels;
+ std::vector<NetInfo *> flat_nets; // flat array of all nets in the design for fast referencing by index
+ std::vector<NetBB> last_bounds;
+ std::vector<std::vector<double>> last_tmg_costs;
+ dict<IdString, NetBB> region_bounds;
+ TimingAnalyser tmg;
+
+ std::shared_timed_mutex archapi_mutex;
+
+ double temperature = 1e-7;
+ int radius = 3;
+
+ wirelen_t total_wirelen = 0;
+ double total_timing_cost = 0;
+
+ double get_timing_cost(const NetInfo *net, store_index<PortRef> user,
+ const dict<IdString, BelId> *cell2bel = nullptr)
+ {
+ if (!net->driver.cell)
+ return 0;
+ const auto &sink = net->users.at(user);
+ IdString driver_pin, sink_pin;
+        // Pick the first pin for a prediction; assume all will be similar enough
+ for (auto pin : ctx->getBelPinsForCellPin(net->driver.cell, net->driver.port)) {
+ driver_pin = pin;
+ break;
+ }
+ for (auto pin : ctx->getBelPinsForCellPin(sink.cell, sink.port)) {
+ sink_pin = pin;
+ break;
+ }
+ float crit = tmg.get_criticality(CellPortKey(sink));
+ BelId src_bel = cell2bel ? cell2bel->at(net->driver.cell->name) : net->driver.cell->bel;
+ BelId dst_bel = cell2bel ? cell2bel->at(sink.cell->name) : sink.cell->bel;
+ double delay = ctx->getDelayNS(ctx->predictDelay(src_bel, driver_pin, dst_bel, sink_pin));
+ return delay * std::pow(crit, cfg.crit_exp);
+ }
+
+ bool skip_net(const NetInfo *net) const
+ {
+ if (!net->driver.cell)
+ return true;
+ if (ctx->getBelGlobalBuf(net->driver.cell->bel))
+ return true;
+ return false;
+ }
+ bool timing_skip_net(const NetInfo *net) const
+ {
+ if (!net->driver.cell)
+ return true;
+ int cc;
+ auto cls = ctx->getPortTimingClass(net->driver.cell, net->driver.port, cc);
+ if (cls == TMG_IGNORE || cls == TMG_GEN_CLOCK)
+ return true;
+ return false;
+ }
+ // ....
+};
+
+struct ThreadState
+{
+ Context *ctx; // Nextpnr context pointer
+ GlobalState &g; // Refinement engine state
+ int idx; // Index of the thread
+ DeterministicRNG rng; // Local RNG
+ // The cell partition that the thread works on
+ Partition p;
+ // Mapping from design-wide net index to thread-wide net index -- not all nets are in all partitions, so we can
+ // optimise
+ std::vector<int> thread_net_idx;
+ // List of nets inside the partition; and their committed bounding boxes & timing costs from the thread's
+ // perspective
+ std::vector<NetInfo *> thread_nets;
+ std::vector<NetBB> net_bounds;
+ std::vector<std::vector<double>> arc_tmg_cost;
+ std::vector<bool> ignored_nets, tmg_ignored_nets;
+ bool arch_state_dirty = false;
+    // Our local cell-bel map, which won't be affected by out-of-partition moves
+ dict<IdString, BelId> local_cell2bel;
+
+ // Data on an inflight move
+    dict<IdString, std::pair<BelId, BelId>> moved_cells; // cell -> (old, new)
+ // For cluster moves only
+ std::vector<std::pair<CellInfo *, Loc>> cell_rel;
+ // For incremental wirelength and delay updates
+ wirelen_t wirelen_delta = 0;
+ double timing_delta = 0;
+
+    // Total number of moves made and accepted
+ int n_move = 0, n_accept = 0;
+
+ enum BoundChange
+ {
+ NO_CHANGE,
+ CELL_MOVED_INWARDS,
+ CELL_MOVED_OUTWARDS,
+ FULL_RECOMPUTE
+ };
+    // Wirelength-related changes are handled on a per-axis basis
+ struct AxisChanges
+ {
+ std::vector<int> bounds_changed_nets;
+ std::vector<BoundChange> already_bounds_changed;
+ };
+ std::array<AxisChanges, 2> axes;
+ std::vector<NetBB> new_net_bounds;
+
+ std::vector<std::vector<bool>> already_timing_changed;
+ std::vector<std::pair<int, store_index<PortRef>>> timing_changed_arcs;
+ std::vector<double> new_timing_costs;
+
+ ThreadState(Context *ctx, GlobalState &g, int idx) : ctx(ctx), g(g), idx(idx){};
+ void set_partition(const Partition &part)
+ {
+ p = part;
+ thread_nets.clear();
+ thread_net_idx.resize(g.flat_nets.size());
+ std::fill(thread_net_idx.begin(), thread_net_idx.end(), -1);
+ // Determine the set of nets that are within the thread; and therefore we care about
+ for (auto thread_cell : part.cells) {
+ for (auto &port : thread_cell->ports) {
+ if (!port.second.net)
+ continue;
+ int global_idx = port.second.net->udata;
+ auto &thread_idx = thread_net_idx.at(global_idx);
+ // Already added to the set
+ if (thread_idx != -1)
+ continue;
+ thread_idx = thread_nets.size();
+ thread_nets.push_back(port.second.net);
+ }
+ }
+ tmg_ignored_nets.clear();
+ ignored_nets.clear();
+ for (auto tn : thread_nets) {
+ ignored_nets.push_back(g.skip_net(tn));
+ tmg_ignored_nets.push_back(g.timing_skip_net(tn));
+ }
+ // Set up the original cell-bel map for all nets inside the thread
+ local_cell2bel.clear();
+ for (NetInfo *net : thread_nets) {
+ if (net->driver.cell)
+ local_cell2bel[net->driver.cell->name] = net->driver.cell->bel;
+ for (auto &usr : net->users)
+ local_cell2bel[usr.cell->name] = usr.cell->bel;
+ }
+ }
+ void setup_initial_state()
+ {
+ // Setup initial net bounding boxes and timing costs
+ net_bounds.clear();
+ arc_tmg_cost.clear();
+ for (auto tn : thread_nets) {
+ net_bounds.push_back(g.last_bounds.at(tn->udata));
+ arc_tmg_cost.push_back(g.last_tmg_costs.at(tn->udata));
+ }
+ new_net_bounds = net_bounds;
+ for (int j = 0; j < 2; j++) {
+ auto &a = axes.at(j);
+ a.already_bounds_changed.resize(net_bounds.size());
+ }
+ already_timing_changed.clear();
+ already_timing_changed.resize(net_bounds.size());
+ for (size_t i = 0; i < thread_nets.size(); i++)
+ already_timing_changed.at(i) = std::vector<bool>(thread_nets.at(i)->users.capacity());
+ }
+ bool bounds_check(BelId bel)
+ {
+ Loc l = ctx->getBelLocation(bel);
+ if (l.x < p.x0 || l.x > p.x1 || l.y < p.y0 || l.y > p.y1)
+ return false;
+ return true;
+ }
+ bool bind_move()
+ {
+ std::unique_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+ for (auto &entry : moved_cells) {
+ ctx->unbindBel(entry.second.first);
+ }
+ bool success = true;
+ for (auto &entry : moved_cells) {
+ // Make sure targets are available before we bind them
+ if (!ctx->checkBelAvail(entry.second.second)) {
+ success = false;
+ break;
+ }
+ ctx->bindBel(entry.second.second, ctx->cells.at(entry.first).get(), STRENGTH_WEAK);
+ }
+ arch_state_dirty = true;
+ return success;
+ }
+ bool check_validity()
+ {
+ std::shared_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+ bool result = true;
+ for (auto e : moved_cells) {
+ if (!ctx->isBelLocationValid(e.second.first)) {
+                // Have to check the old location too, as unbinding a bel could make a placement illegal by virtue of no longer
+ // enabling dedicated routes to be used
+ result = false;
+ break;
+ }
+ if (!ctx->isBelLocationValid(e.second.second)) {
+ result = false;
+ break;
+ }
+ }
+ return result;
+ }
+ void revert_move()
+ {
+ if (arch_state_dirty) {
+ // If changes to the arch state were made, revert them by restoring original cell bindings
+ std::unique_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+ for (auto &entry : moved_cells) {
+ BelId curr_bound = ctx->cells.at(entry.first)->bel;
+ if (curr_bound != BelId())
+ ctx->unbindBel(curr_bound);
+ }
+ for (auto &entry : moved_cells) {
+ ctx->bindBel(entry.second.first, ctx->cells.at(entry.first).get(), STRENGTH_WEAK);
+ }
+ arch_state_dirty = false;
+ }
+ for (auto &entry : moved_cells)
+ local_cell2bel[entry.first] = entry.second.first;
+ }
+ void commit_move()
+ {
+ arch_state_dirty = false;
+ for (auto &axis : axes) {
+ for (auto bc : axis.bounds_changed_nets) {
+ // Commit updated net bounds
+ net_bounds.at(bc) = new_net_bounds.at(bc);
+ }
+ }
+ if (g.cfg.timing_driven) {
+ NPNR_ASSERT(timing_changed_arcs.size() == new_timing_costs.size());
+ for (size_t i = 0; i < timing_changed_arcs.size(); i++) {
+ auto arc = timing_changed_arcs.at(i);
+ arc_tmg_cost.at(arc.first).at(arc.second.idx()) = new_timing_costs.at(i);
+ }
+ }
+ }
+ void compute_changes_for_cell(CellInfo *cell, BelId old_bel, BelId new_bel)
+ {
+ Loc new_loc = ctx->getBelLocation(new_bel);
+ Loc old_loc = ctx->getBelLocation(old_bel);
+ for (const auto &port : cell->ports) {
+ NetInfo *pn = port.second.net;
+ if (!pn)
+ continue;
+ int idx = thread_net_idx.at(pn->udata);
+ if (ignored_nets.at(idx))
+ continue;
+ NetBB &new_bounds = new_net_bounds.at(idx);
+ // For the x-axis (i=0) and y-axis (i=1)
+ for (int i = 0; i < 2; i++) {
+ auto &axis = axes.at(i);
+ // New and old on this axis
+ int new_pos = i ? new_loc.y : new_loc.x, old_pos = i ? old_loc.y : old_loc.x;
+ // References to updated bounding box entries
+ auto &b0 = i ? new_bounds.y0 : new_bounds.x0;
+ auto &n0 = i ? new_bounds.ny0 : new_bounds.nx0;
+ auto &b1 = i ? new_bounds.y1 : new_bounds.x1;
+ auto &n1 = i ? new_bounds.ny1 : new_bounds.nx1;
+ auto &change = axis.already_bounds_changed.at(idx);
+ // Lower bound
+ if (new_pos < b0) {
+ // Further out than current lower bound
+ b0 = new_pos;
+ n0 = 1;
+ if (change == NO_CHANGE) {
+ change = CELL_MOVED_OUTWARDS;
+ axis.bounds_changed_nets.push_back(idx);
+ }
+ } else if (new_pos == b0 && old_pos > b0) {
+ // Moved from inside into current bound
+ ++n0;
+ if (change == NO_CHANGE) {
+ change = CELL_MOVED_OUTWARDS;
+ axis.bounds_changed_nets.push_back(idx);
+ }
+ } else if (old_pos == b0 && new_pos > b0) {
+ // Moved from current bound to inside
+ if (change == NO_CHANGE)
+ axis.bounds_changed_nets.push_back(idx);
+ if (n0 == 1) {
+ // Was the last cell on the bound; have to do a full recompute
+ change = FULL_RECOMPUTE;
+ } else {
+ --n0;
+ if (change == NO_CHANGE)
+ change = CELL_MOVED_INWARDS;
+ }
+ }
+ // Upper bound
+ if (new_pos > b1) {
+ // Further out than current upper bound
+ b1 = new_pos;
+                    n1 = 1;
+ if (change == NO_CHANGE) {
+ change = CELL_MOVED_OUTWARDS;
+ axis.bounds_changed_nets.push_back(idx);
+ }
+ } else if (new_pos == b1 && old_pos < b1) {
+ // Moved onto current bound
+ ++n1;
+ if (change == NO_CHANGE) {
+ change = CELL_MOVED_OUTWARDS;
+ axis.bounds_changed_nets.push_back(idx);
+ }
+ } else if (old_pos == b1 && new_pos < b1) {
+ // Moved from current bound to inside
+ if (change == NO_CHANGE)
+ axis.bounds_changed_nets.push_back(idx);
+ if (n1 == 1) {
+ // Was the last cell on the bound; have to do a full recompute
+ change = FULL_RECOMPUTE;
+ } else {
+ --n1;
+ if (change == NO_CHANGE)
+ change = CELL_MOVED_INWARDS;
+ }
+ }
+ }
+ // Timing updates if timing driven
+ if (g.cfg.timing_driven && !tmg_ignored_nets.at(idx)) {
+ if (port.second.type == PORT_OUT) {
+ int cc;
+ TimingPortClass cls = ctx->getPortTimingClass(cell, port.first, cc);
+ if (cls != TMG_IGNORE) {
+ for (auto usr : pn->users.enumerate())
+ if (!already_timing_changed.at(idx).at(usr.index.idx())) {
+ timing_changed_arcs.emplace_back(std::make_pair(idx, usr.index));
+ already_timing_changed.at(idx).at(usr.index.idx()) = true;
+ }
+ }
+ } else {
+ auto usr = port.second.user_idx;
+ if (!already_timing_changed.at(idx).at(usr.idx())) {
+ timing_changed_arcs.emplace_back(std::make_pair(idx, usr));
+ already_timing_changed.at(idx).at(usr.idx()) = true;
+ }
+ }
+ }
+ }
+ }
+ void compute_total_change()
+ {
+ auto &xa = axes.at(0), &ya = axes.at(1);
+ for (auto &bc : xa.bounds_changed_nets)
+ if (xa.already_bounds_changed.at(bc) == FULL_RECOMPUTE)
+ new_net_bounds.at(bc) = NetBB::compute(ctx, thread_nets.at(bc), &local_cell2bel);
+ for (auto &bc : ya.bounds_changed_nets)
+ if (xa.already_bounds_changed.at(bc) != FULL_RECOMPUTE &&
+ ya.already_bounds_changed.at(bc) == FULL_RECOMPUTE)
+ new_net_bounds.at(bc) = NetBB::compute(ctx, thread_nets.at(bc), &local_cell2bel);
+ for (auto &bc : xa.bounds_changed_nets)
+ wirelen_delta += (new_net_bounds.at(bc).hpwl(g.cfg) - net_bounds.at(bc).hpwl(g.cfg));
+ for (auto &bc : ya.bounds_changed_nets)
+ if (xa.already_bounds_changed.at(bc) == NO_CHANGE)
+ wirelen_delta += (new_net_bounds.at(bc).hpwl(g.cfg) - net_bounds.at(bc).hpwl(g.cfg));
+ if (g.cfg.timing_driven) {
+ NPNR_ASSERT(new_timing_costs.empty());
+ for (auto arc : timing_changed_arcs) {
+ double new_cost = g.get_timing_cost(thread_nets.at(arc.first), arc.second, &local_cell2bel);
+ timing_delta += (new_cost - arc_tmg_cost.at(arc.first).at(arc.second.idx()));
+ new_timing_costs.push_back(new_cost);
+ }
+ }
+ }
+ void reset_move_state()
+ {
+ moved_cells.clear();
+ cell_rel.clear();
+ for (auto &axis : axes) {
+ for (auto bc : axis.bounds_changed_nets) {
+ new_net_bounds.at(bc) = net_bounds.at(bc);
+ axis.already_bounds_changed[bc] = NO_CHANGE;
+ }
+ axis.bounds_changed_nets.clear();
+ }
+ for (auto &arc : timing_changed_arcs) {
+ already_timing_changed.at(arc.first).at(arc.second.idx()) = false;
+ }
+ timing_changed_arcs.clear();
+ new_timing_costs.clear();
+ wirelen_delta = 0;
+ timing_delta = 0;
+ }
+
+ bool accept_move()
+ {
+ static constexpr double epsilon = 1e-20;
+ double delta = g.cfg.lambda * (timing_delta / std::max<double>(epsilon, g.total_timing_cost)) +
+ (1.0 - g.cfg.lambda) * (double(wirelen_delta) / std::max<double>(epsilon, g.total_wirelen));
+ return delta < 0 ||
+ (g.temperature > 1e-8 && (rng.rng() / float(0x3fffffff)) <= std::exp(-delta / g.temperature));
+ }
+
+ bool add_to_move(CellInfo *cell, BelId old_bel, BelId new_bel)
+ {
+ if (!bounds_check(old_bel) || !bounds_check(new_bel))
+ return false;
+ if (!ctx->isValidBelForCellType(cell->type, new_bel))
+ return false;
+ NPNR_ASSERT(!moved_cells.count(cell->name));
+ moved_cells[cell->name] = std::make_pair(old_bel, new_bel);
+ local_cell2bel[cell->name] = new_bel;
+ compute_changes_for_cell(cell, old_bel, new_bel);
+ return true;
+ }
+
+ bool single_cell_swap(CellInfo *cell, BelId new_bel)
+ {
+ NPNR_ASSERT(moved_cells.empty());
+ BelId old_bel = cell->bel;
+ CellInfo *bound = ctx->getBoundBelCell(new_bel);
+ if (bound && (bound->belStrength > STRENGTH_STRONG || bound->cluster != ClusterId()))
+ return false;
+ if (!add_to_move(cell, old_bel, new_bel))
+ goto fail;
+ if (bound && !add_to_move(bound, new_bel, old_bel))
+ goto fail;
+ compute_total_change();
+ // SA acceptance criteria
+
+ if (!accept_move()) {
+ // SA fail
+ goto fail;
+ }
+ // Check validity rules
+ if (!bind_move())
+ goto fail;
+ if (!check_validity())
+ goto fail;
+ // Accepted!
+ commit_move();
+ reset_move_state();
+ return true;
+ fail:
+ revert_move();
+ reset_move_state();
+ return false;
+ }
+
+ bool chain_swap(CellInfo *root_cell, BelId new_root_bel)
+ {
+ NPNR_ASSERT(moved_cells.empty());
+ std::queue<std::pair<ClusterId, BelId>> displaced_clusters;
+ pool<BelId> used_bels;
+ displaced_clusters.emplace(root_cell->cluster, new_root_bel);
+ while (!displaced_clusters.empty()) {
+ std::vector<std::pair<CellInfo *, BelId>> dest_bels;
+ auto cursor = displaced_clusters.front();
+ displaced_clusters.pop();
+ if (!ctx->getClusterPlacement(cursor.first, cursor.second, dest_bels))
+ goto fail;
+ for (const auto &db : dest_bels) {
+ BelId old_bel = db.first->bel;
+ if (moved_cells.count(db.first->name))
+ goto fail;
+ if (!add_to_move(db.first, old_bel, db.second))
+ goto fail;
+ if (used_bels.count(db.second))
+ goto fail;
+ used_bels.insert(db.second);
+
+ CellInfo *bound = ctx->getBoundBelCell(db.second);
+ if (bound) {
+ if (moved_cells.count(bound->name)) {
+ // Don't move a cell multiple times in the same go
+ goto fail;
+ } else if (bound->belStrength > STRENGTH_STRONG) {
+ goto fail;
+ } else if (bound->cluster != ClusterId()) {
+ // Displace the entire cluster
+ Loc old_loc = ctx->getBelLocation(old_bel);
+ Loc bound_loc = ctx->getBelLocation(bound->bel);
+ Loc root_loc = ctx->getBelLocation(ctx->getClusterRootCell(bound->cluster)->bel);
+ BelId new_root = ctx->getBelByLocation(Loc(old_loc.x + (root_loc.x - bound_loc.x),
+ old_loc.y + (root_loc.y - bound_loc.y),
+ old_loc.z + (root_loc.z - bound_loc.z)));
+ if (new_root == BelId())
+ goto fail;
+ displaced_clusters.emplace(bound->cluster, new_root);
+ } else {
+ // Single cell swap
+ if (used_bels.count(old_bel))
+ goto fail;
+ used_bels.insert(old_bel);
+ if (!add_to_move(bound, bound->bel, old_bel))
+ goto fail;
+ }
+ } else if (!ctx->checkBelAvail(db.second)) {
+ goto fail;
+ }
+ }
+ }
+ compute_total_change();
+ // SA acceptance criteria
+
+ if (!accept_move()) {
+ // SA fail
+ goto fail;
+ }
+ // Check validity rules
+ if (!bind_move())
+ goto fail;
+ if (!check_validity())
+ goto fail;
+ // Accepted!
+ commit_move();
+ reset_move_state();
+ return true;
+ fail:
+ revert_move();
+ reset_move_state();
+ return false;
+ }
+
+ BelId random_bel_for_cell(CellInfo *cell, int force_z = -1)
+ {
+ IdString targetType = cell->type;
+ Loc curr_loc = ctx->getBelLocation(cell->bel);
+ int count = 0;
+
+ int dx = g.radius, dy = g.radius;
+ if (cell->region != nullptr && cell->region->constr_bels) {
+ dx = std::min(g.cfg.hpwl_scale_x * g.radius,
+ (g.region_bounds[cell->region->name].x1 - g.region_bounds[cell->region->name].x0) + 1);
+ dy = std::min(g.cfg.hpwl_scale_y * g.radius,
+ (g.region_bounds[cell->region->name].y1 - g.region_bounds[cell->region->name].y0) + 1);
+ // Clamp location to within bounds
+ curr_loc.x = std::max(g.region_bounds[cell->region->name].x0, curr_loc.x);
+ curr_loc.x = std::min(g.region_bounds[cell->region->name].x1, curr_loc.x);
+ curr_loc.y = std::max(g.region_bounds[cell->region->name].y0, curr_loc.y);
+ curr_loc.y = std::min(g.region_bounds[cell->region->name].y1, curr_loc.y);
+ }
+
+ FastBels::FastBelsData *bel_data;
+ auto type_cnt = g.bels.getBelsForCellType(targetType, &bel_data);
+
+ while (true) {
+ int nx = rng.rng(2 * dx + 1) + std::max(curr_loc.x - dx, 0);
+ int ny = rng.rng(2 * dy + 1) + std::max(curr_loc.y - dy, 0);
+ if (type_cnt < 64)
+ nx = ny = 0;
+ if (nx >= int(bel_data->size()))
+ continue;
+ if (ny >= int(bel_data->at(nx).size()))
+ continue;
+ const auto &fb = bel_data->at(nx).at(ny);
+ if (fb.size() == 0)
+ continue;
+ BelId bel = fb.at(rng.rng(int(fb.size())));
+ if (!bounds_check(bel))
+ continue;
+ if (force_z != -1) {
+ Loc loc = ctx->getBelLocation(bel);
+ if (loc.z != force_z)
+ continue;
+ }
+ if (!cell->testRegion(bel))
+ continue;
+ count++;
+ return bel;
+ }
+ }
+
+ void run_iter()
+ {
+ setup_initial_state();
+ n_accept = 0;
+ n_move = 0;
+ for (int m = 0; m < g.cfg.inner_iters; m++) {
+ for (auto cell : p.cells) {
+ if (cell->belStrength > STRENGTH_STRONG)
+ continue;
+ if (cell->cluster != ClusterId()) {
+ if (cell != ctx->getClusterRootCell(cell->cluster))
+ continue; // only move cluster root
+ Loc old_loc = ctx->getBelLocation(cell->bel);
+ BelId new_root = random_bel_for_cell(cell, old_loc.z);
+ if (new_root == BelId() || new_root == cell->bel)
+ continue;
+ ++n_move;
+ if (chain_swap(cell, new_root))
+ ++n_accept;
+ } else {
+ BelId new_bel = random_bel_for_cell(cell);
+ if (new_bel == BelId() || new_bel == cell->bel)
+ continue;
+ ++n_move;
+ if (single_cell_swap(cell, new_bel))
+ ++n_accept;
+ }
+ }
+ }
+ }
+};
+
+struct ParallelRefine
+{
+ Context *ctx;
+ GlobalState g;
+ std::vector<ThreadState> t;
+ ParallelRefine(Context *ctx, ParallelRefineCfg cfg) : ctx(ctx), g(ctx, cfg)
+ {
+ g.flat_nets.reserve(ctx->nets.size());
+ for (auto &net : ctx->nets) {
+ net.second->udata = g.flat_nets.size();
+ g.flat_nets.push_back(net.second.get());
+ }
+ // Setup per thread context
+ for (int i = 0; i < cfg.threads; i++) {
+ t.emplace_back(ctx, g, i);
+ }
+ // Setup region bounds
+ for (auto &region : ctx->region) {
+ Region *r = region.second.get();
+ NetBB bb;
+ if (r->constr_bels) {
+ bb.x0 = std::numeric_limits<int>::max();
+ bb.x1 = std::numeric_limits<int>::min();
+ bb.y0 = std::numeric_limits<int>::max();
+ bb.y1 = std::numeric_limits<int>::min();
+ for (auto bel : r->bels) {
+ Loc loc = ctx->getBelLocation(bel);
+ bb.x0 = std::min(bb.x0, loc.x);
+ bb.x1 = std::max(bb.x1, loc.x);
+ bb.y0 = std::min(bb.y0, loc.y);
+ bb.y1 = std::max(bb.y1, loc.y);
+ }
+ } else {
+ bb.x0 = 0;
+ bb.y0 = 0;
+ bb.x1 = ctx->getGridDimX();
+ bb.y1 = ctx->getGridDimY();
+ }
+ g.region_bounds[r->name] = bb;
+ }
+ // Setup fast bels map
+ pool<IdString> cell_types_in_use;
+ for (auto &cell : ctx->cells) {
+ IdString cell_type = cell.second->type;
+ cell_types_in_use.insert(cell_type);
+ }
+
+ for (auto cell_type : cell_types_in_use) {
+ g.bels.addCellType(cell_type);
+ }
+ };
+ std::vector<Partition> parts;
+ void do_partition()
+ {
+ parts.clear();
+ parts.emplace_back(ctx);
+ bool yaxis = false;
+ while (parts.size() < t.size()) {
+ std::vector<Partition> next(parts.size() * 2);
+ for (size_t i = 0; i < parts.size(); i++) {
+ // Randomly permute pivot every iteration so we get different thread boundaries
+ const float delta = 0.1;
+ float pivot = (0.5 - (delta / 2)) + (delta / 2) * (ctx->rng(10000) / 10000.0f);
+ parts.at(i).split(ctx, yaxis, pivot, next.at(i * 2), next.at(i * 2 + 1));
+ }
+ std::swap(parts, next);
+ yaxis = !yaxis;
+ }
+
+ NPNR_ASSERT(parts.size() == t.size());
+ // TODO: thread pool to make this worthwhile...
+ std::vector<std::thread> workers;
+ for (size_t i = 0; i < t.size(); i++) {
+ workers.emplace_back([i, this]() { t.at(i).set_partition(parts.at(i)); });
+ }
+ for (auto &w : workers)
+ w.join();
+ }
+
+ void update_global_costs()
+ {
+ g.last_bounds.resize(g.flat_nets.size());
+ g.last_tmg_costs.resize(g.flat_nets.size());
+ g.total_wirelen = 0;
+ g.total_timing_cost = 0;
+ for (size_t i = 0; i < g.flat_nets.size(); i++) {
+ NetInfo *ni = g.flat_nets.at(i);
+ if (g.skip_net(ni))
+ continue;
+ g.last_bounds.at(i) = NetBB::compute(ctx, ni);
+ g.total_wirelen += g.last_bounds.at(i).hpwl(g.cfg);
+ if (!g.timing_skip_net(ni)) {
+ auto &tc = g.last_tmg_costs.at(i);
+ tc.resize(ni->users.capacity());
+ for (auto usr : ni->users.enumerate()) {
+ tc.at(usr.index.idx()) = g.get_timing_cost(ni, usr.index);
+ g.total_timing_cost += tc.at(usr.index.idx());
+ }
+ }
+ }
+ }
+ void run()
+ {
+
+ ScopeLock<Context> lock(ctx);
+ auto refine_start = std::chrono::high_resolution_clock::now();
+
+ g.tmg.setup_only = true;
+ g.tmg.setup();
+ do_partition();
+ log_info("Running parallel refinement with %d threads.\n", int(t.size()));
+ int iter = 1;
+ bool done = false;
+ update_global_costs();
+ double avg_wirelen = g.total_wirelen;
+ wirelen_t min_wirelen = g.total_wirelen;
+ while (true) {
+ if (iter > 1) {
+ if (g.total_wirelen >= min_wirelen) {
+ done = true;
+ } else if (g.total_wirelen < min_wirelen) {
+ min_wirelen = g.total_wirelen;
+ }
+ int n_accept = 0, n_move = 0;
+ for (auto &t_data : t) {
+ n_accept += t_data.n_accept;
+ n_move += t_data.n_move;
+ }
+ double r_accept = n_accept / double(n_move);
+ if (g.total_wirelen < (0.95 * avg_wirelen) && g.total_wirelen > 0) {
+ avg_wirelen = 0.8 * avg_wirelen + 0.2 * g.total_wirelen;
+ } else {
+ if (r_accept > 0.15 && g.radius > 1) {
+ g.temperature *= 0.95;
+ } else {
+ g.temperature *= 0.8;
+ }
+ }
+ if ((iter % 10) == 0 && g.radius > 1)
+ --g.radius;
+ }
+
+ if ((iter == 1) || ((iter % 5) == 0) || done)
+ log_info(" at iteration #%d: temp = %f, timing cost = "
+ "%.0f, wirelen = %.0f\n",
+ iter, g.temperature, double(g.total_timing_cost), double(g.total_wirelen));
+
+ if (done)
+ break;
+
+ do_partition();
+
+ std::vector<std::thread> workers;
+ workers.reserve(t.size());
+ for (int j = 0; j < int(t.size()); j++)
+ workers.emplace_back([this, j]() { t.at(j).run_iter(); });
+ for (auto &w : workers)
+ w.join();
+ g.tmg.run();
+ update_global_costs();
+ iter++;
+ ctx->yield();
+ }
+ auto refine_end = std::chrono::high_resolution_clock::now();
+ log_info("Placement refine time %.02fs\n", std::chrono::duration<float>(refine_end - refine_start).count());
+ }
+};
+} // namespace
+
+ParallelRefineCfg::ParallelRefineCfg(Context *ctx)
+{
+ timing_driven = ctx->setting<bool>("timing_driven");
+ threads = ctx->setting<int>("threads", 8);
+    // snap the thread count down to a power of two, keeping at least min_thread_size cells per thread
+ int actual_threads = 1;
+ while ((actual_threads * 2) <= threads && (int(ctx->cells.size()) / (actual_threads * 2)) >= min_thread_size)
+ actual_threads *= 2;
+ threads = actual_threads;
+ hpwl_scale_x = 1;
+ hpwl_scale_y = 1;
+}
+
+bool parallel_refine(Context *ctx, ParallelRefineCfg cfg)
+{
+ // TODO
+ ParallelRefine refine(ctx, cfg);
+ refine.run();
+ timing_analysis(ctx);
+ return true;
+}
+
+NEXTPNR_NAMESPACE_END
+
+#else /* !defined(__wasm) */
+
+NEXTPNR_NAMESPACE_BEGIN
+
+bool parallel_refine(Context *ctx, ParallelRefineCfg cfg) { log_abort(); }
+
+NEXTPNR_NAMESPACE_END
+
+#endif
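
The acceptance test in ThreadState::accept_move() above combines the timing and wirelength deltas, each normalised by its design-wide total, into a single cost delta weighted by lambda. A distilled, standalone restatement of that rule (illustrative; the engine itself draws its random value from the per-thread RNG):

    #include <algorithm>
    #include <cmath>

    // Accept improving moves unconditionally; accept worsening moves with
    // probability exp(-delta / temperature), as in classic simulated annealing.
    bool sa_accept(double timing_delta, double wirelen_delta, double total_timing,
                   double total_wirelen, double lambda, double temperature, double rand01)
    {
        const double eps = 1e-20;
        double delta = lambda * (timing_delta / std::max(eps, total_timing)) +
                       (1.0 - lambda) * (wirelen_delta / std::max(eps, total_wirelen));
        return delta < 0 || (temperature > 1e-8 && rand01 <= std::exp(-delta / temperature));
    }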
diff --git a/common/place/parallel_refine.h b/common/place/parallel_refine.h
new file mode 100644
index 00000000..556317cd
--- /dev/null
+++ b/common/place/parallel_refine.h
@@ -0,0 +1,43 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2021 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef PARALLEL_REFINE_H
+#define PARALLEL_REFINE_H
+
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+struct ParallelRefineCfg
+{
+ ParallelRefineCfg(Context *ctx);
+ bool timing_driven;
+ int threads;
+ int hpwl_scale_x, hpwl_scale_y;
+ double lambda = 0.5f;
+ float crit_exp = 8;
+ int inner_iters = 15;
+ int min_thread_size = 500;
+};
+
+bool parallel_refine(Context *ctx, ParallelRefineCfg cfg);
+
+NEXTPNR_NAMESPACE_END
+
+#endif
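
A minimal sketch of calling the refiner declared above from a flow, assuming the design already holds a legal placement and a valid Context *ctx; the configuration is derived from context settings, and individual fields may be overridden before the call:

    ParallelRefineCfg cfg(ctx);
    cfg.inner_iters = 15; // matches the default above; shown only as an example override
    if (!parallel_refine(ctx, cfg))
        log_error("parallel refinement failed\n");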
diff --git a/common/place/place_common.cc b/common/place/place_common.cc
new file mode 100644
index 00000000..e03fca55
--- /dev/null
+++ b/common/place/place_common.cc
@@ -0,0 +1,487 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2018 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "place_common.h"
+#include <cmath>
+#include "log.h"
+#include "util.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+// Get the total estimated wirelength for a net
+wirelen_t get_net_metric(const Context *ctx, const NetInfo *net, MetricType type, float &tns)
+{
+ wirelen_t wirelength = 0;
+ CellInfo *driver_cell = net->driver.cell;
+ if (!driver_cell)
+ return 0;
+ if (driver_cell->bel == BelId())
+ return 0;
+ bool driver_gb = ctx->getBelGlobalBuf(driver_cell->bel);
+ if (driver_gb)
+ return 0;
+ int clock_count;
+ bool timing_driven = ctx->setting<bool>("timing_driven") && type == MetricType::COST &&
+ ctx->getPortTimingClass(driver_cell, net->driver.port, clock_count) != TMG_IGNORE;
+ delay_t negative_slack = 0;
+ delay_t worst_slack = std::numeric_limits<delay_t>::max();
+ Loc driver_loc = ctx->getBelLocation(driver_cell->bel);
+ int xmin = driver_loc.x, xmax = driver_loc.x, ymin = driver_loc.y, ymax = driver_loc.y;
+ for (auto load : net->users) {
+ if (load.cell == nullptr)
+ continue;
+ CellInfo *load_cell = load.cell;
+ if (load_cell->bel == BelId())
+ continue;
+ if (timing_driven) {
+ delay_t net_delay = ctx->predictArcDelay(net, load);
+ auto slack = load.budget - net_delay;
+ if (slack < 0)
+ negative_slack += slack;
+ worst_slack = std::min(slack, worst_slack);
+ }
+
+ if (ctx->getBelGlobalBuf(load_cell->bel))
+ continue;
+ Loc load_loc = ctx->getBelLocation(load_cell->bel);
+
+ xmin = std::min(xmin, load_loc.x);
+ ymin = std::min(ymin, load_loc.y);
+ xmax = std::max(xmax, load_loc.x);
+ ymax = std::max(ymax, load_loc.y);
+ }
+ if (timing_driven) {
+ wirelength = wirelen_t(
+ (((ymax - ymin) + (xmax - xmin)) * std::min(5.0, (1.0 + std::exp(-ctx->getDelayNS(worst_slack) / 5)))));
+ } else {
+ wirelength = wirelen_t((ymax - ymin) + (xmax - xmin));
+ }
+
+ tns += ctx->getDelayNS(negative_slack);
+ return wirelength;
+}
+
+// Get the total wirelength for a cell
+wirelen_t get_cell_metric(const Context *ctx, const CellInfo *cell, MetricType type)
+{
+ std::set<IdString> nets;
+ for (auto p : cell->ports) {
+ if (p.second.net)
+ nets.insert(p.second.net->name);
+ }
+ wirelen_t wirelength = 0;
+ float tns = 0;
+ for (auto n : nets) {
+ wirelength += get_net_metric(ctx, ctx->nets.at(n).get(), type, tns);
+ }
+ return wirelength;
+}
+
+wirelen_t get_cell_metric_at_bel(const Context *ctx, CellInfo *cell, BelId bel, MetricType type)
+{
+ BelId oldBel = cell->bel;
+ cell->bel = bel;
+ wirelen_t wirelen = get_cell_metric(ctx, cell, type);
+ cell->bel = oldBel;
+ return wirelen;
+}
+
+// Placing a single cell
+bool place_single_cell(Context *ctx, CellInfo *cell, bool require_legality)
+{
+ bool all_placed = false;
+ int iters = 25;
+ while (!all_placed) {
+ BelId best_bel = BelId();
+ wirelen_t best_wirelen = std::numeric_limits<wirelen_t>::max(),
+ best_ripup_wirelen = std::numeric_limits<wirelen_t>::max();
+ CellInfo *ripup_target = nullptr;
+ BelId ripup_bel = BelId();
+ if (cell->bel != BelId()) {
+ ctx->unbindBel(cell->bel);
+ }
+ IdString targetType = cell->type;
+ for (auto bel : ctx->getBels()) {
+ if (ctx->isValidBelForCellType(targetType, bel)) {
+ if (ctx->checkBelAvail(bel)) {
+ wirelen_t wirelen = get_cell_metric_at_bel(ctx, cell, bel, MetricType::COST);
+ if (iters >= 4)
+ wirelen += ctx->rng(25);
+ if (wirelen <= best_wirelen) {
+ best_wirelen = wirelen;
+ best_bel = bel;
+ }
+ } else {
+ wirelen_t wirelen = get_cell_metric_at_bel(ctx, cell, bel, MetricType::COST);
+ if (iters >= 4)
+ wirelen += ctx->rng(25);
+ if (wirelen <= best_ripup_wirelen) {
+ CellInfo *curr_cell = ctx->getBoundBelCell(bel);
+ if (curr_cell->belStrength < STRENGTH_STRONG) {
+ best_ripup_wirelen = wirelen;
+ ripup_bel = bel;
+ ripup_target = curr_cell;
+ }
+ }
+ }
+ }
+ }
+ if (best_bel == BelId()) {
+ if (iters == 0) {
+ log_error("failed to place cell '%s' of type '%s' (ripup iteration limit exceeded)\n",
+ cell->name.c_str(ctx), cell->type.c_str(ctx));
+ }
+ if (ripup_bel == BelId()) {
+ log_error("failed to place cell '%s' of type '%s'\n", cell->name.c_str(ctx), cell->type.c_str(ctx));
+ }
+ --iters;
+ ctx->unbindBel(ripup_target->bel);
+ best_bel = ripup_bel;
+ } else {
+ ripup_target = nullptr;
+ all_placed = true;
+ }
+ ctx->bindBel(best_bel, cell, STRENGTH_WEAK);
+ if (require_legality && !ctx->isBelLocationValid(best_bel)) {
+ ctx->unbindBel(best_bel);
+ if (ripup_target != nullptr) {
+ ctx->bindBel(best_bel, ripup_target, STRENGTH_WEAK);
+ }
+ all_placed = false;
+ continue;
+ }
+ if (ctx->verbose)
+ log_info(" placed single cell '%s' at '%s'\n", cell->name.c_str(ctx), ctx->nameOfBel(best_bel));
+ cell = ripup_target;
+ }
+ return true;
+}
+
+class ConstraintLegaliseWorker
+{
+ private:
+ Context *ctx;
+ std::set<IdString> rippedCells;
+ dict<IdString, Loc> oldLocations;
+ dict<ClusterId, std::vector<CellInfo *>> cluster2cells;
+
+ class IncreasingDiameterSearch
+ {
+ public:
+ IncreasingDiameterSearch() : start(0), min(0), max(-1){};
+ IncreasingDiameterSearch(int x) : start(x), min(x), max(x){};
+ IncreasingDiameterSearch(int start, int min, int max) : start(start), min(min), max(max){};
+ bool done() const { return (diameter > (max - min)); };
+ int get() const
+ {
+ int val = start + sign * diameter;
+ val = std::max(val, min);
+ val = std::min(val, max);
+ return val;
+ }
+
+ void next()
+ {
+ if (sign == 0) {
+ sign = 1;
+ diameter = 1;
+ } else if (sign == -1) {
+ sign = 1;
+ if ((start + sign * diameter) > max)
+ sign = -1;
+ ++diameter;
+ } else {
+ sign = -1;
+ if ((start + sign * diameter) < min) {
+ sign = 1;
+ ++diameter;
+ }
+ }
+ }
+
+ void reset()
+ {
+ sign = 0;
+ diameter = 0;
+ }
+
+ private:
+ int start, min, max;
+ int diameter = 0;
+ int sign = 0;
+ };
+
+ typedef dict<IdString, Loc> CellLocations;
+
+ // Check if a location would be suitable for a cell and all its constrained children
+ bool valid_loc_for(const CellInfo *cell, Loc loc, CellLocations &solution, pool<Loc> &usedLocations)
+ {
+ BelId locBel = ctx->getBelByLocation(loc);
+ if (locBel == BelId())
+ return false;
+
+ if (cell->cluster == ClusterId()) {
+ if (!ctx->isValidBelForCellType(cell->type, locBel))
+ return false;
+ if (!ctx->checkBelAvail(locBel)) {
+ CellInfo *confCell = ctx->getConflictingBelCell(locBel);
+ if (confCell->belStrength >= STRENGTH_STRONG) {
+ return false;
+ }
+ }
+ // Don't place at tiles where any strongly bound Bels exist, as we might need to rip them up later
+ for (auto tilebel : ctx->getBelsByTile(loc.x, loc.y)) {
+ CellInfo *tcell = ctx->getBoundBelCell(tilebel);
+ if (tcell && tcell->belStrength >= STRENGTH_STRONG)
+ return false;
+ }
+ usedLocations.insert(loc);
+ solution[cell->name] = loc;
+ } else {
+ std::vector<std::pair<CellInfo *, BelId>> placement;
+ if (!ctx->getClusterPlacement(cell->cluster, locBel, placement))
+ return false;
+ for (auto &p : placement) {
+ Loc p_loc = ctx->getBelLocation(p.second);
+ if (!ctx->checkBelAvail(p.second)) {
+ CellInfo *confCell = ctx->getConflictingBelCell(p.second);
+ if (confCell->belStrength >= STRENGTH_STRONG) {
+ return false;
+ }
+ }
+ // Don't place at tiles where any strongly bound Bels exist, as we might need to rip them up later
+ for (auto tilebel : ctx->getBelsByTile(p_loc.x, p_loc.y)) {
+ CellInfo *tcell = ctx->getBoundBelCell(tilebel);
+ if (tcell && tcell->belStrength >= STRENGTH_STRONG)
+ return false;
+ }
+ usedLocations.insert(p_loc);
+ solution[p.first->name] = p_loc;
+ }
+ }
+
+ return true;
+ }
+
+ // Set the strength to locked on all cells in chain
+ void lockdown_chain(CellInfo *root)
+ {
+ root->belStrength = STRENGTH_STRONG;
+ if (root->cluster != ClusterId())
+ for (auto child : cluster2cells.at(root->cluster))
+ child->belStrength = STRENGTH_STRONG;
+ }
+
+ // Legalise placement constraints on a cell
+ bool legalise_cell(CellInfo *cell)
+ {
+ if (cell->cluster != ClusterId() && ctx->getClusterRootCell(cell->cluster) != cell)
+ return true; // Only process chain roots
+ if (constraints_satisfied(cell)) {
+ if (cell->cluster != ClusterId())
+ lockdown_chain(cell);
+ } else {
+ IncreasingDiameterSearch xRootSearch, yRootSearch, zRootSearch;
+ Loc currentLoc;
+ if (cell->bel != BelId())
+ currentLoc = ctx->getBelLocation(cell->bel);
+ else
+ currentLoc = oldLocations[cell->name];
+ xRootSearch = IncreasingDiameterSearch(currentLoc.x, 0, ctx->getGridDimX() - 1);
+ yRootSearch = IncreasingDiameterSearch(currentLoc.y, 0, ctx->getGridDimY() - 1);
+ zRootSearch = IncreasingDiameterSearch(currentLoc.z, 0, ctx->getTileBelDimZ(currentLoc.x, currentLoc.y));
+
+ while (!xRootSearch.done()) {
+ Loc rootLoc;
+
+ rootLoc.x = xRootSearch.get();
+ rootLoc.y = yRootSearch.get();
+ rootLoc.z = zRootSearch.get();
+ zRootSearch.next();
+ if (zRootSearch.done()) {
+ zRootSearch.reset();
+ yRootSearch.next();
+ if (yRootSearch.done()) {
+ yRootSearch.reset();
+ xRootSearch.next();
+ }
+ }
+
+ CellLocations solution;
+ pool<Loc> used;
+ if (valid_loc_for(cell, rootLoc, solution, used)) {
+ for (auto cp : solution) {
+ // First unbind all cells
+ if (ctx->cells.at(cp.first)->bel != BelId())
+ ctx->unbindBel(ctx->cells.at(cp.first)->bel);
+ }
+ for (auto cp : solution) {
+ if (ctx->verbose)
+ log_info(" placing '%s' at (%d, %d, %d)\n", cp.first.c_str(ctx), cp.second.x,
+ cp.second.y, cp.second.z);
+ BelId target = ctx->getBelByLocation(cp.second);
+ if (!ctx->checkBelAvail(target)) {
+ CellInfo *confl_cell = ctx->getConflictingBelCell(target);
+ if (confl_cell != nullptr) {
+ if (ctx->verbose)
+ log_info(" '%s' already placed at '%s'\n", ctx->nameOf(confl_cell),
+ ctx->nameOfBel(confl_cell->bel));
+ NPNR_ASSERT(confl_cell->belStrength < STRENGTH_STRONG);
+ ctx->unbindBel(target);
+ rippedCells.insert(confl_cell->name);
+ }
+ }
+ ctx->bindBel(target, ctx->cells.at(cp.first).get(), STRENGTH_STRONG);
+ rippedCells.erase(cp.first);
+ }
+ for (auto cp : solution) {
+ for (auto bel : ctx->getBelsByTile(cp.second.x, cp.second.y)) {
+ CellInfo *belCell = ctx->getBoundBelCell(bel);
+ if (belCell != nullptr && !solution.count(belCell->name)) {
+ if (!ctx->isBelLocationValid(bel)) {
+ NPNR_ASSERT(belCell->belStrength < STRENGTH_STRONG);
+ ctx->unbindBel(bel);
+ rippedCells.insert(belCell->name);
+ }
+ }
+ }
+ }
+ NPNR_ASSERT(constraints_satisfied(cell));
+ return true;
+ }
+ }
+ return false;
+ }
+ return true;
+ }
+
+ // Check if constraints are currently satisfied on a cell and its children
+ bool constraints_satisfied(const CellInfo *cell) { return get_constraints_distance(ctx, cell) == 0; }
+
+ public:
+ ConstraintLegaliseWorker(Context *ctx) : ctx(ctx)
+ {
+ for (auto &cell : ctx->cells) {
+ if (cell.second->cluster != ClusterId())
+ cluster2cells[cell.second->cluster].push_back(cell.second.get());
+ }
+ };
+
+ unsigned print_stats(const char *point)
+ {
+ float distance_sum = 0;
+ float max_distance = 0;
+ unsigned moved_cells = 0;
+ unsigned unplaced_cells = 0;
+ for (auto orig : oldLocations) {
+ if (ctx->cells.at(orig.first)->bel == BelId()) {
+ unplaced_cells++;
+ continue;
+ }
+ Loc newLoc = ctx->getBelLocation(ctx->cells.at(orig.first)->bel);
+ if (newLoc != orig.second) {
+ float distance = std::sqrt(std::pow(newLoc.x - orig.second.x, 2) + pow(newLoc.y - orig.second.y, 2));
+ moved_cells++;
+ distance_sum += distance;
+ if (distance > max_distance)
+ max_distance = distance;
+ }
+ }
+ log_info(" moved %d cells, %d unplaced (after %s)\n", moved_cells, unplaced_cells, point);
+ if (moved_cells > 0) {
+ log_info(" average distance %f\n", (distance_sum / moved_cells));
+ log_info(" maximum distance %f\n", max_distance);
+ }
+ return moved_cells + unplaced_cells;
+ }
+
+ int legalise_constraints()
+ {
+ log_info("Legalising relative constraints...\n");
+ for (auto &cell : ctx->cells) {
+ oldLocations[cell.first] = ctx->getBelLocation(cell.second->bel);
+ }
+ for (auto &cell : ctx->cells) {
+ bool res = legalise_cell(cell.second.get());
+ if (!res) {
+ log_error("failed to place chain starting at cell '%s'\n", cell.first.c_str(ctx));
+ return -1;
+ }
+ }
+ if (print_stats("legalising chains") == 0)
+ return 0;
+ for (auto rippedCell : rippedCells) {
+ bool res = place_single_cell(ctx, ctx->cells.at(rippedCell).get(), true);
+ if (!res) {
+ log_error("failed to place cell '%s' after relative constraint legalisation\n", rippedCell.c_str(ctx));
+ return -1;
+ }
+ }
+ auto score = print_stats("replacing ripped up cells");
+ for (auto &cell : ctx->cells)
+ if (get_constraints_distance(ctx, cell.second.get()) != 0)
+ log_error("constraint satisfaction check failed for cell '%s' at Bel '%s'\n", cell.first.c_str(ctx),
+ ctx->nameOfBel(cell.second->bel));
+ return score;
+ }
+};
+
+bool legalise_relative_constraints(Context *ctx) { return ConstraintLegaliseWorker(ctx).legalise_constraints() > 0; }
+
+// Get the total distance from satisfied constraints for a cell
+int get_constraints_distance(const Context *ctx, const CellInfo *cell)
+{
+ int dist = 0;
+ if (cell->bel == BelId())
+ return 100000;
+ Loc loc = ctx->getBelLocation(cell->bel);
+
+ if (cell->cluster != ClusterId()) {
+ CellInfo *root = ctx->getClusterRootCell(cell->cluster);
+ if (root == cell) {
+ // parent
+ std::vector<std::pair<CellInfo *, BelId>> placement;
+ if (!ctx->getClusterPlacement(cell->cluster, cell->bel, placement)) {
+ return 100000;
+ } else {
+ for (const auto &p : placement) {
+ if (p.first->bel == BelId())
+ return 100000;
+ Loc c_loc = ctx->getBelLocation(p.first->bel);
+ Loc p_loc = ctx->getBelLocation(p.second);
+ dist += std::abs(c_loc.x - p_loc.x);
+ dist += std::abs(c_loc.y - p_loc.y);
+ dist += std::abs(c_loc.z - p_loc.z);
+ }
+ }
+ } else {
+ // child
+ if (root->bel == BelId())
+ return 100000;
+ Loc root_loc = ctx->getBelLocation(root->bel);
+ Loc offset = ctx->getClusterOffset(cell);
+ dist += std::abs((root_loc.x + offset.x) - loc.x);
+ dist += std::abs((root_loc.y + offset.y) - loc.y);
+ }
+ }
+
+ return dist;
+}
+
+NEXTPNR_NAMESPACE_END
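
A worked restatement of the timing weighting applied inside get_net_metric() above (illustrative): in cost mode the plain half-perimeter wirelength (xmax - xmin) + (ymax - ymin) is scaled by a factor between 1 and 5 that grows as the worst slack on the net becomes more negative.

    #include <algorithm>
    #include <cmath>

    // Weight applied to a net's HPWL when timing driven; worst_slack_ns is the
    // worst slack across the net's users, in nanoseconds.
    double timing_weight(double worst_slack_ns)
    {
        return std::min(5.0, 1.0 + std::exp(-worst_slack_ns / 5.0));
    }
    // e.g. 0 ns slack gives a weight of 2.0; -10 ns slack saturates at the 5.0 cap.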
diff --git a/common/place/place_common.h b/common/place/place_common.h
new file mode 100644
index 00000000..5e5cbee3
--- /dev/null
+++ b/common/place/place_common.h
@@ -0,0 +1,55 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2018 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef PLACE_COMMON_H
+#define PLACE_COMMON_H
+
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+typedef int64_t wirelen_t;
+
+enum class MetricType
+{
+ COST,
+ WIRELENGTH
+};
+
+// Return the wirelength of a net
+wirelen_t get_net_metric(const Context *ctx, const NetInfo *net, MetricType type, float &tns);
+
+// Return the wirelength of all nets connected to a cell
+wirelen_t get_cell_metric(const Context *ctx, const CellInfo *cell, MetricType type);
+
+// Return the wirelength of all nets connected to a cell, when the cell is at a given bel
+wirelen_t get_cell_metric_at_bel(const Context *ctx, CellInfo *cell, BelId bel, MetricType type);
+
+// Place a single cell in the lowest wirelength Bel available, optionally requiring validity check
+bool place_single_cell(Context *ctx, CellInfo *cell, bool require_legality);
+
+// Modify a design s.t. all relative placement constraints are satisfied
+bool legalise_relative_constraints(Context *ctx);
+
+// Get the total distance from satisfied constraints for a cell
+int get_constraints_distance(const Context *ctx, const CellInfo *cell);
+
+NEXTPNR_NAMESPACE_END
+
+#endif
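
A small sketch combining the helpers declared above to evaluate candidate BELs for one cell (illustrative; 'cell' is an assumed CellInfo* and place_common.h plus <limits> are assumed to be included; the loop mirrors the scan performed by place_single_cell()):

    BelId best;
    wirelen_t best_metric = std::numeric_limits<wirelen_t>::max();
    for (auto bel : ctx->getBels()) {
        if (!ctx->isValidBelForCellType(cell->type, bel) || !ctx->checkBelAvail(bel))
            continue;
        wirelen_t metric = get_cell_metric_at_bel(ctx, cell, bel, MetricType::WIRELENGTH);
        if (metric < best_metric) {
            best_metric = metric;
            best = bel;
        }
    }
    // 'best' now holds the lowest-wirelength available BEL, or BelId() if none was found.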
diff --git a/common/place/placer1.cc b/common/place/placer1.cc
new file mode 100644
index 00000000..a6ba3895
--- /dev/null
+++ b/common/place/placer1.cc
@@ -0,0 +1,1317 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2018 Claire Xenia Wolf <claire@yosyshq.com>
+ * Copyright (C) 2018 gatecat <gatecat@ds0.me>
+ *
+ * Simulated annealing implementation based on arachne-pnr
+ * Copyright (C) 2015-2018 Cotton Seed
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "placer1.h"
+#include <algorithm>
+#include <boost/lexical_cast.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <list>
+#include <map>
+#include <ostream>
+#include <queue>
+#include <set>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include "fast_bels.h"
+#include "log.h"
+#include "place_common.h"
+#include "scope_lock.h"
+#include "timing.h"
+#include "util.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+class SAPlacer
+{
+ private:
+ struct BoundingBox
+ {
+ // Actual bounding box
+ int x0 = 0, x1 = 0, y0 = 0, y1 = 0;
+ // Number of cells at each extremity
+ int nx0 = 0, nx1 = 0, ny0 = 0, ny1 = 0;
+ wirelen_t hpwl(const Placer1Cfg &cfg) const
+ {
+ return wirelen_t(cfg.hpwl_scale_x * (x1 - x0) + cfg.hpwl_scale_y * (y1 - y0));
+ }
+ };
+
+ public:
+ SAPlacer(Context *ctx, Placer1Cfg cfg)
+ : ctx(ctx), fast_bels(ctx, /*check_bel_available=*/false, cfg.minBelsForGridPick), cfg(cfg), tmg(ctx)
+ {
+ for (auto bel : ctx->getBels()) {
+ Loc loc = ctx->getBelLocation(bel);
+ max_x = std::max(max_x, loc.x);
+ max_y = std::max(max_y, loc.y);
+ }
+ diameter = std::max(max_x, max_y) + 1;
+
+ pool<IdString> cell_types_in_use;
+ for (auto &cell : ctx->cells) {
+ IdString cell_type = cell.second->type;
+ cell_types_in_use.insert(cell_type);
+ }
+
+ for (auto cell_type : cell_types_in_use) {
+ fast_bels.addCellType(cell_type);
+ }
+
+ net_bounds.resize(ctx->nets.size());
+ net_arc_tcost.resize(ctx->nets.size());
+ old_udata.reserve(ctx->nets.size());
+ net_by_udata.reserve(ctx->nets.size());
+ decltype(NetInfo::udata) n = 0;
+ for (auto &net : ctx->nets) {
+ old_udata.emplace_back(net.second->udata);
+ net_arc_tcost.at(n).resize(net.second->users.capacity());
+ net.second->udata = n++;
+ net_by_udata.push_back(net.second.get());
+ }
+ for (auto &region : ctx->region) {
+ Region *r = region.second.get();
+ BoundingBox bb;
+ if (r->constr_bels) {
+ bb.x0 = std::numeric_limits<int>::max();
+ bb.x1 = std::numeric_limits<int>::min();
+ bb.y0 = std::numeric_limits<int>::max();
+ bb.y1 = std::numeric_limits<int>::min();
+ for (auto bel : r->bels) {
+ Loc loc = ctx->getBelLocation(bel);
+ bb.x0 = std::min(bb.x0, loc.x);
+ bb.x1 = std::max(bb.x1, loc.x);
+ bb.y0 = std::min(bb.y0, loc.y);
+ bb.y1 = std::max(bb.y1, loc.y);
+ }
+ } else {
+ bb.x0 = 0;
+ bb.y0 = 0;
+ bb.x1 = max_x;
+ bb.y1 = max_y;
+ }
+ region_bounds[r->name] = bb;
+ }
+ for (auto &cell : ctx->cells) {
+ CellInfo *ci = cell.second.get();
+ if (ci->cluster == ClusterId())
+ continue;
+ cluster2cell[ci->cluster].push_back(ci);
+ }
+ }
+
+ ~SAPlacer()
+ {
+ for (auto &net : ctx->nets)
+ net.second->udata = old_udata[net.second->udata];
+ }
+
+ bool place(bool refine = false)
+ {
+ log_break();
+
+ ScopeLock<Context> lock(ctx);
+
+ size_t placed_cells = 0;
+ std::vector<CellInfo *> autoplaced;
+ std::vector<CellInfo *> chain_basis;
+ if (!refine) {
+ // Initial constraints placer
+ for (auto &cell_entry : ctx->cells) {
+ CellInfo *cell = cell_entry.second.get();
+ auto loc = cell->attrs.find(ctx->id("BEL"));
+ if (loc != cell->attrs.end()) {
+ std::string loc_name = loc->second.as_string();
+ BelId bel = ctx->getBelByNameStr(loc_name);
+ if (bel == BelId()) {
+ log_error("No Bel named \'%s\' located for "
+ "this chip (processing BEL attribute on \'%s\')\n",
+ loc_name.c_str(), cell->name.c_str(ctx));
+ }
+
+ if (!ctx->isValidBelForCellType(cell->type, bel)) {
+ IdString bel_type = ctx->getBelType(bel);
+ log_error("Bel \'%s\' of type \'%s\' does not match cell "
+ "\'%s\' of type \'%s\'\n",
+ loc_name.c_str(), bel_type.c_str(ctx), cell->name.c_str(ctx), cell->type.c_str(ctx));
+ }
+ auto bound_cell = ctx->getBoundBelCell(bel);
+ if (bound_cell) {
+ log_error(
+ "Cell \'%s\' cannot be bound to bel \'%s\' since it is already bound to cell \'%s\'\n",
+ cell->name.c_str(ctx), loc_name.c_str(), bound_cell->name.c_str(ctx));
+ }
+
+ ctx->bindBel(bel, cell, STRENGTH_USER);
+ if (!ctx->isBelLocationValid(bel)) {
+ IdString bel_type = ctx->getBelType(bel);
+ log_error("Bel \'%s\' of type \'%s\' is not valid for cell "
+ "\'%s\' of type \'%s\'\n",
+ loc_name.c_str(), bel_type.c_str(ctx), cell->name.c_str(ctx), cell->type.c_str(ctx));
+ }
+ locked_bels.insert(bel);
+ placed_cells++;
+ }
+ }
+ int constr_placed_cells = placed_cells;
+ log_info("Placed %d cells based on constraints.\n", int(placed_cells));
+ ctx->yield();
+
+ // Sort to-place cells for deterministic initial placement
+
+ for (auto &cell : ctx->cells) {
+ CellInfo *ci = cell.second.get();
+ if (ci->bel == BelId()) {
+ autoplaced.push_back(cell.second.get());
+ }
+ }
+ std::sort(autoplaced.begin(), autoplaced.end(), [](CellInfo *a, CellInfo *b) { return a->name < b->name; });
+ ctx->shuffle(autoplaced);
+ auto iplace_start = std::chrono::high_resolution_clock::now();
+ // Place cells randomly initially
+ log_info("Creating initial placement for remaining %d cells.\n", int(autoplaced.size()));
+
+ for (auto cell : autoplaced) {
+ place_initial(cell);
+ placed_cells++;
+ if ((placed_cells - constr_placed_cells) % 500 == 0)
+ log_info(" initial placement placed %d/%d cells\n", int(placed_cells - constr_placed_cells),
+ int(autoplaced.size()));
+ }
+ if ((placed_cells - constr_placed_cells) % 500 != 0)
+ log_info(" initial placement placed %d/%d cells\n", int(placed_cells - constr_placed_cells),
+ int(autoplaced.size()));
+ if (cfg.budgetBased && cfg.slack_redist_iter > 0)
+ assign_budget(ctx);
+ ctx->yield();
+ auto iplace_end = std::chrono::high_resolution_clock::now();
+ log_info("Initial placement time %.02fs\n",
+ std::chrono::duration<float>(iplace_end - iplace_start).count());
+ log_info("Running simulated annealing placer.\n");
+ } else {
+ for (auto &cell : ctx->cells) {
+ CellInfo *ci = cell.second.get();
+ if (ci->belStrength > STRENGTH_STRONG) {
+ continue;
+ } else if (ci->cluster != ClusterId()) {
+ if (ctx->getClusterRootCell(ci->cluster) == ci)
+ chain_basis.push_back(ci);
+ else
+ continue;
+ } else {
+ autoplaced.push_back(ci);
+ }
+ }
+ require_legal = false;
+ diameter = 3;
+ log_info("Running simulated annealing placer for refinement.\n");
+ }
+ auto saplace_start = std::chrono::high_resolution_clock::now();
+
+ // Invoke timing analysis to obtain criticalities
+ tmg.setup_only = true;
+ if (!cfg.budgetBased)
+ tmg.setup();
+
+ // Calculate costs after initial placement
+ setup_costs();
+ moveChange.init(this);
+ curr_wirelen_cost = total_wirelen_cost();
+ curr_timing_cost = total_timing_cost();
+ last_wirelen_cost = curr_wirelen_cost;
+ last_timing_cost = curr_timing_cost;
+
+ if (cfg.netShareWeight > 0)
+ setup_nets_by_tile();
+
+ wirelen_t avg_wirelen = curr_wirelen_cost;
+ wirelen_t min_wirelen = curr_wirelen_cost;
+
+ int n_no_progress = 0;
+ temp = refine ? 1e-7 : cfg.startTemp;
+
+ // Main simulated annealing loop
+ for (int iter = 1;; iter++) {
+ n_move = n_accept = 0;
+ improved = false;
+
+ if (iter % 5 == 0 || iter == 1)
+ log_info(" at iteration #%d: temp = %f, timing cost = "
+ "%.0f, wirelen = %.0f\n",
+ iter, temp, double(curr_timing_cost), double(curr_wirelen_cost));
+
+ for (int m = 0; m < 15; ++m) {
+ // Loop through all automatically placed cells
+ for (auto cell : autoplaced) {
+ // Find another random Bel for this cell
+ BelId try_bel = random_bel_for_cell(cell);
+ // If valid, try and swap to a new position and see if
+ // the new position is valid/worthwhile
+ if (try_bel != BelId() && try_bel != cell->bel)
+ try_swap_position(cell, try_bel);
+ }
+ // Also try swapping chains, if applicable
+ for (auto cb : chain_basis) {
+ Loc chain_base_loc = ctx->getBelLocation(cb->bel);
+ BelId try_base = random_bel_for_cell(cb, chain_base_loc.z);
+ if (try_base != BelId() && try_base != cb->bel)
+ try_swap_chain(cb, try_base);
+ }
+ }
+
+ if (ctx->debug) {
+ // Verify correctness of incremental wirelen updates
+ for (size_t i = 0; i < net_bounds.size(); i++) {
+ auto net = net_by_udata[i];
+ if (ignore_net(net))
+ continue;
+ auto &incr = net_bounds.at(i), gold = get_net_bounds(net);
+ NPNR_ASSERT(incr.x0 == gold.x0);
+ NPNR_ASSERT(incr.x1 == gold.x1);
+ NPNR_ASSERT(incr.y0 == gold.y0);
+ NPNR_ASSERT(incr.y1 == gold.y1);
+ NPNR_ASSERT(incr.nx0 == gold.nx0);
+ NPNR_ASSERT(incr.nx1 == gold.nx1);
+ NPNR_ASSERT(incr.ny0 == gold.ny0);
+ NPNR_ASSERT(incr.ny1 == gold.ny1);
+ }
+ }
+
+ if (curr_wirelen_cost < min_wirelen) {
+ min_wirelen = curr_wirelen_cost;
+ improved = true;
+ }
+
+ // Heuristic to improve placement on the 8k
+ if (improved)
+ n_no_progress = 0;
+ else
+ n_no_progress++;
+
+ if (temp <= 1e-7 && n_no_progress >= (refine ? 1 : 5)) {
+ log_info(" at iteration #%d: temp = %f, timing cost = "
+ "%.0f, wirelen = %.0f \n",
+ iter, temp, double(curr_timing_cost), double(curr_wirelen_cost));
+ break;
+ }
+
+ double Raccept = double(n_accept) / double(n_move);
+
+ int M = std::max(max_x, max_y) + 1;
+
+ if (ctx->verbose)
+ log("iter #%d: temp = %f, timing cost = "
+ "%.0f, wirelen = %.0f, dia = %d, Ra = %.02f \n",
+ iter, temp, double(curr_timing_cost), double(curr_wirelen_cost), diameter, Raccept);
+
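+            // The block below implements the adaptive cooling schedule. Worked example
+            // with illustrative numbers: at diameter = 20 and acceptance rate
+            // Raccept = 0.5, the next diameter is int(20 * (1.0 - 0.44 + 0.5) + 0.5) = 21
+            // (clamped to [1, M]), and since 0.15 < Raccept <= 0.8 the temperature is
+            // multiplied by 0.95.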
+ if (curr_wirelen_cost < 0.95 * avg_wirelen && curr_wirelen_cost > 0) {
+ avg_wirelen = 0.8 * avg_wirelen + 0.2 * curr_wirelen_cost;
+ } else {
+ double diam_next = diameter * (1.0 - 0.44 + Raccept);
+ diameter = std::max<int>(1, std::min<int>(M, int(diam_next + 0.5)));
+ if (Raccept > 0.96) {
+ temp *= 0.5;
+ } else if (Raccept > 0.8) {
+ temp *= 0.9;
+ } else if (Raccept > 0.15 && diameter > 1) {
+ temp *= 0.95;
+ } else {
+ temp *= 0.8;
+ }
+ }
+ // Once cooled below legalise threshold, run legalisation and start requiring
+ // legal moves only
+ if (diameter < legalise_dia && require_legal) {
+ if (legalise_relative_constraints(ctx)) {
+ // Only increase temperature if something was moved
+ autoplaced.clear();
+ chain_basis.clear();
+ for (auto &cell : ctx->cells) {
+ if (cell.second->belStrength <= STRENGTH_STRONG && cell.second->cluster != ClusterId() &&
+ ctx->getClusterRootCell(cell.second->cluster) == cell.second.get())
+ chain_basis.push_back(cell.second.get());
+ else if (cell.second->belStrength < STRENGTH_STRONG)
+ autoplaced.push_back(cell.second.get());
+ }
+ // temp = post_legalise_temp;
+ // diameter = std::min<int>(M, diameter * post_legalise_dia_scale);
+ ctx->shuffle(autoplaced);
+
+ // Legalisation is a big change so force a slack redistribution here
+ if (cfg.slack_redist_iter > 0 && cfg.budgetBased)
+ assign_budget(ctx, true /* quiet */);
+ }
+ require_legal = false;
+ } else if (cfg.budgetBased && cfg.slack_redist_iter > 0 && iter % cfg.slack_redist_iter == 0) {
+ assign_budget(ctx, true /* quiet */);
+ }
+
+ // Invoke timing analysis to obtain criticalities
+ if (!cfg.budgetBased && cfg.timing_driven)
+ tmg.run();
+ // Need to rebuild costs after criticalities change
+ setup_costs();
+ // Reset incremental bounds
+ moveChange.reset(this);
+ moveChange.new_net_bounds = net_bounds;
+
+ // Recalculate total metric entirely to avoid rounding errors
+ // accumulating over time
+ curr_wirelen_cost = total_wirelen_cost();
+ curr_timing_cost = total_timing_cost();
+ last_wirelen_cost = curr_wirelen_cost;
+ last_timing_cost = curr_timing_cost;
+ // Let the UI show visualization updates.
+ ctx->yield();
+ }
+
+ auto saplace_end = std::chrono::high_resolution_clock::now();
+ log_info("SA placement time %.02fs\n", std::chrono::duration<float>(saplace_end - saplace_start).count());
+
+ // Final post-placement validity check
+ ctx->yield();
+ for (auto bel : ctx->getBels()) {
+ CellInfo *cell = ctx->getBoundBelCell(bel);
+ if (!ctx->isBelLocationValid(bel)) {
+ std::string cell_text = "no cell";
+ if (cell != nullptr)
+ cell_text = std::string("cell '") + ctx->nameOf(cell) + "'";
+ if (ctx->force) {
+ log_warning("post-placement validity check failed for Bel '%s' "
+ "(%s)\n",
+ ctx->nameOfBel(bel), cell_text.c_str());
+ } else {
+ log_error("post-placement validity check failed for Bel '%s' "
+ "(%s)\n",
+ ctx->nameOfBel(bel), cell_text.c_str());
+ }
+ }
+ }
+ timing_analysis(ctx);
+
+ return true;
+ }
+
+ private:
+ // Initial random placement
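+    // If no free bel of the right type is available, a randomly chosen occupant
+    // bound weaker than STRENGTH_STRONG is ripped up and the loop continues by
+    // re-placing that displaced cell; after 25 rip-ups in a row (or if no rip-up
+    // candidate exists) the placer gives up with an error.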
+ void place_initial(CellInfo *cell)
+ {
+ bool all_placed = false;
+ int iters = 25;
+ while (!all_placed) {
+ BelId best_bel = BelId();
+ uint64_t best_score = std::numeric_limits<uint64_t>::max(),
+ best_ripup_score = std::numeric_limits<uint64_t>::max();
+ CellInfo *ripup_target = nullptr;
+ BelId ripup_bel = BelId();
+ if (cell->bel != BelId()) {
+ ctx->unbindBel(cell->bel);
+ }
+ IdString targetType = cell->type;
+
+ auto proc_bel = [&](BelId bel) {
+ if (ctx->isValidBelForCellType(targetType, bel)) {
+ if (ctx->checkBelAvail(bel)) {
+ uint64_t score = ctx->rng64();
+ if (score <= best_score) {
+ best_score = score;
+ best_bel = bel;
+ }
+ } else {
+ uint64_t score = ctx->rng64();
+ CellInfo *bound_cell = ctx->getBoundBelCell(bel);
+ if (score <= best_ripup_score && bound_cell->belStrength < STRENGTH_STRONG) {
+ best_ripup_score = score;
+ ripup_target = bound_cell;
+ ripup_bel = bel;
+ }
+ }
+ }
+ };
+
+ if (cell->region != nullptr && cell->region->constr_bels) {
+ for (auto bel : cell->region->bels) {
+ proc_bel(bel);
+ }
+ } else {
+ for (auto bel : ctx->getBels()) {
+ proc_bel(bel);
+ }
+ }
+
+ if (best_bel == BelId()) {
+ if (iters == 0 || ripup_bel == BelId())
+ log_error("failed to place cell '%s' of type '%s'\n", cell->name.c_str(ctx), cell->type.c_str(ctx));
+ --iters;
+ ctx->unbindBel(ripup_target->bel);
+ best_bel = ripup_bel;
+ } else {
+ ripup_target = nullptr;
+ all_placed = true;
+ }
+ ctx->bindBel(best_bel, cell, STRENGTH_WEAK);
+
+ if (!ctx->isBelLocationValid(best_bel)) {
+ ctx->unbindBel(best_bel);
+ if (ripup_target != nullptr) {
+ ctx->bindBel(best_bel, ripup_target, STRENGTH_WEAK);
+ }
+ all_placed = false;
+ continue;
+ }
+
+ // Back annotate location
+ cell->attrs[ctx->id("BEL")] = ctx->getBelName(cell->bel).str(ctx);
+ cell = ripup_target;
+ }
+ }
+
+ // Attempt a SA position swap, return true on success or false on failure
+ bool try_swap_position(CellInfo *cell, BelId newBel)
+ {
+ static const double epsilon = 1e-20;
+ moveChange.reset(this);
+ if (!require_legal && cell->cluster != ClusterId())
+ return false;
+ BelId oldBel = cell->bel;
+ CellInfo *other_cell = ctx->getBoundBelCell(newBel);
+ if (!require_legal && other_cell != nullptr &&
+ (other_cell->cluster != ClusterId() || other_cell->belStrength > STRENGTH_WEAK)) {
+ return false;
+ }
+ int old_dist = get_constraints_distance(ctx, cell);
+ int new_dist;
+ if (other_cell != nullptr)
+ old_dist += get_constraints_distance(ctx, other_cell);
+ double delta = 0;
+
+ if (!ctx->isValidBelForCellType(cell->type, newBel)) {
+ return false;
+ }
+ if (other_cell != nullptr && !ctx->isValidBelForCellType(other_cell->type, oldBel)) {
+ return false;
+ }
+
+ int net_delta_score = 0;
+ if (cfg.netShareWeight > 0)
+ net_delta_score += update_nets_by_tile(cell, ctx->getBelLocation(cell->bel), ctx->getBelLocation(newBel));
+
+ ctx->unbindBel(oldBel);
+ if (other_cell != nullptr) {
+ ctx->unbindBel(newBel);
+ }
+
+ ctx->bindBel(newBel, cell, STRENGTH_WEAK);
+
+ if (other_cell != nullptr) {
+ ctx->bindBel(oldBel, other_cell, STRENGTH_WEAK);
+ if (cfg.netShareWeight > 0)
+ net_delta_score +=
+ update_nets_by_tile(other_cell, ctx->getBelLocation(newBel), ctx->getBelLocation(oldBel));
+ }
+
+ add_move_cell(moveChange, cell, oldBel);
+
+ if (other_cell != nullptr) {
+ add_move_cell(moveChange, other_cell, newBel);
+ }
+
+        // Always check both the new and old locations, because in some cases involving dedicated routing,
+        // ripping up a cell can deny use of a dedicated path and thus make the site illegal
+ if (!ctx->isBelLocationValid(newBel) || !ctx->isBelLocationValid(oldBel)) {
+ ctx->unbindBel(newBel);
+ if (other_cell != nullptr)
+ ctx->unbindBel(oldBel);
+ goto swap_fail;
+ }
+
+ // Recalculate metrics for all nets touched by the perturbation
+ compute_cost_changes(moveChange);
+
+ new_dist = get_constraints_distance(ctx, cell);
+ if (other_cell != nullptr)
+ new_dist += get_constraints_distance(ctx, other_cell);
+ delta = lambda * (moveChange.timing_delta / std::max<double>(last_timing_cost, epsilon)) +
+ (1 - lambda) * (double(moveChange.wirelen_delta) / std::max<double>(last_wirelen_cost, epsilon));
+ delta += (cfg.constraintWeight / temp) * (new_dist - old_dist) / last_wirelen_cost;
+ if (cfg.netShareWeight > 0)
+ delta += -cfg.netShareWeight * (net_delta_score / std::max<double>(total_net_share, epsilon));
+ n_move++;
+ // SA acceptance criteria
+ if (delta < 0 || (temp > 1e-8 && (ctx->rng() / float(0x3fffffff)) <= std::exp(-delta / temp))) {
+ n_accept++;
+ } else {
+ if (other_cell != nullptr)
+ ctx->unbindBel(oldBel);
+ ctx->unbindBel(newBel);
+ goto swap_fail;
+ }
+ commit_cost_changes(moveChange);
+#if 0
+ log_info("swap %s -> %s\n", cell->name.c_str(ctx), ctx->nameOfBel(newBel));
+ if (other_cell != nullptr)
+ log_info("swap %s -> %s\n", other_cell->name.c_str(ctx), ctx->nameOfBel(oldBel));
+#endif
+ return true;
+ swap_fail:
+ ctx->bindBel(oldBel, cell, STRENGTH_WEAK);
+ if (other_cell != nullptr) {
+ ctx->bindBel(newBel, other_cell, STRENGTH_WEAK);
+ if (cfg.netShareWeight > 0)
+ update_nets_by_tile(other_cell, ctx->getBelLocation(oldBel), ctx->getBelLocation(newBel));
+ }
+ if (cfg.netShareWeight > 0)
+ update_nets_by_tile(cell, ctx->getBelLocation(newBel), ctx->getBelLocation(oldBel));
+ return false;
+ }
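+
+    // Worked example of the acceptance test above, with illustrative numbers: a
+    // move with normalised cost change delta = 0.02 at temperature temp = 0.1 is
+    // accepted with probability exp(-0.02 / 0.1) = exp(-0.2) ~= 0.82, i.e. whenever
+    // the pseudo-random draw (scaled to roughly [0, 1]) is <= 0.82; any move with
+    // delta < 0 is always kept.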
+
+ // Swap the Bel of a cell with another, return the original location
+ BelId swap_cell_bels(CellInfo *cell, BelId newBel)
+ {
+ BelId oldBel = cell->bel;
+#if 0
+ log_info("%s old: %s new: %s\n", cell->name.c_str(ctx), ctx->nameOfBel(cell->bel), ctx->nameOfBel(newBel));
+#endif
+ CellInfo *bound = ctx->getBoundBelCell(newBel);
+ if (bound != nullptr)
+ ctx->unbindBel(newBel);
+ ctx->unbindBel(oldBel);
+ ctx->bindBel(newBel, cell, (cell->cluster != ClusterId()) ? STRENGTH_STRONG : STRENGTH_WEAK);
+ if (bound != nullptr) {
+ ctx->bindBel(oldBel, bound, (bound->cluster != ClusterId()) ? STRENGTH_STRONG : STRENGTH_WEAK);
+ if (cfg.netShareWeight > 0)
+ update_nets_by_tile(bound, ctx->getBelLocation(newBel), ctx->getBelLocation(oldBel));
+ }
+ if (cfg.netShareWeight > 0)
+ update_nets_by_tile(cell, ctx->getBelLocation(oldBel), ctx->getBelLocation(newBel));
+ return oldBel;
+ }
+
+ // Attempt to swap a chain with a non-chain
+ bool try_swap_chain(CellInfo *cell, BelId newBase)
+ {
+ std::vector<std::pair<CellInfo *, Loc>> cell_rel;
+ dict<IdString, BelId> moved_cells;
+ double delta = 0;
+ int orig_share_cost = total_net_share;
+ moveChange.reset(this);
+#if CHAIN_DEBUG
+ log_info("finding cells for chain swap %s\n", cell->name.c_str(ctx));
+#endif
+ std::queue<std::pair<ClusterId, BelId>> displaced_clusters;
+ displaced_clusters.emplace(cell->cluster, newBase);
+ while (!displaced_clusters.empty()) {
+ std::vector<std::pair<CellInfo *, BelId>> dest_bels;
+ auto cursor = displaced_clusters.front();
+#if CHAIN_DEBUG
+ log_info("%d Cluster %s\n", __LINE__, cursor.first.c_str(ctx));
+#endif
+ displaced_clusters.pop();
+ if (!ctx->getClusterPlacement(cursor.first, cursor.second, dest_bels))
+ goto swap_fail;
+ for (const auto &db : dest_bels) {
+ // Ensure the cluster is ripped up
+ if (db.first->bel != BelId()) {
+ moved_cells[db.first->name] = db.first->bel;
+#if CHAIN_DEBUG
+ log_info("%d unbind %s\n", __LINE__, ctx->nameOfBel(db.first->bel));
+#endif
+ ctx->unbindBel(db.first->bel);
+ }
+ }
+ for (const auto &db : dest_bels) {
+ CellInfo *bound = ctx->getBoundBelCell(db.second);
+ BelId old_bel = moved_cells.at(db.first->name);
+ if (!ctx->checkBelAvail(old_bel) && bound != nullptr) {
+ // Simple swap no longer possible
+ goto swap_fail;
+ }
+ if (bound != nullptr) {
+ if (moved_cells.count(bound->name)) {
+ // Don't move a cell multiple times in the same go
+ goto swap_fail;
+ } else if (bound->belStrength > STRENGTH_STRONG) {
+ goto swap_fail;
+ } else if (bound->cluster != ClusterId()) {
+ // Displace the entire cluster
+ Loc old_loc = ctx->getBelLocation(old_bel);
+ Loc bound_loc = ctx->getBelLocation(bound->bel);
+ Loc root_loc = ctx->getBelLocation(ctx->getClusterRootCell(bound->cluster)->bel);
+ BelId new_root = ctx->getBelByLocation(Loc(old_loc.x + (root_loc.x - bound_loc.x),
+ old_loc.y + (root_loc.y - bound_loc.y),
+ old_loc.z + (root_loc.z - bound_loc.z)));
+ if (new_root == BelId())
+ goto swap_fail;
+ for (auto cluster_cell : cluster2cell.at(bound->cluster)) {
+ moved_cells[cluster_cell->name] = cluster_cell->bel;
+#if CHAIN_DEBUG
+ log_info("%d unbind %s\n", __LINE__, ctx->nameOfBel(cluster_cell->bel));
+#endif
+ ctx->unbindBel(cluster_cell->bel);
+ }
+ displaced_clusters.emplace(bound->cluster, new_root);
+ } else {
+ // Just a single cell to move
+ moved_cells[bound->name] = bound->bel;
+#if CHAIN_DEBUG
+ log_info("%d unbind %s\n", __LINE__, ctx->nameOfBel(bound->bel));
+ log_info("%d bind %s %s\n", __LINE__, ctx->nameOfBel(old_bel), ctx->nameOf(bound));
+#endif
+ ctx->unbindBel(bound->bel);
+ ctx->bindBel(old_bel, bound, STRENGTH_WEAK);
+ }
+ } else if (!ctx->checkBelAvail(db.second)) {
+ goto swap_fail;
+ }
+ // All those shenanigans should now mean the target bel is free to use
+#if CHAIN_DEBUG
+ log_info("%d bind %s %s\n", __LINE__, ctx->nameOfBel(db.second), ctx->nameOf(db.first));
+#endif
+ ctx->bindBel(db.second, db.first, STRENGTH_WEAK);
+ }
+ }
+
+ for (const auto &mm : moved_cells) {
+ CellInfo *cell = ctx->cells.at(mm.first).get();
+ add_move_cell(moveChange, cell, moved_cells.at(cell->name));
+ if (cfg.netShareWeight > 0)
+ update_nets_by_tile(cell, ctx->getBelLocation(moved_cells.at(cell->name)),
+ ctx->getBelLocation(cell->bel));
+ if (!ctx->isBelLocationValid(cell->bel) || !cell->testRegion(cell->bel))
+ goto swap_fail;
+ }
+#if CHAIN_DEBUG
+ log_info("legal chain swap %s\n", cell->name.c_str(ctx));
+#endif
+ compute_cost_changes(moveChange);
+ delta = lambda * (moveChange.timing_delta / last_timing_cost) +
+ (1 - lambda) * (double(moveChange.wirelen_delta) / last_wirelen_cost);
+ if (cfg.netShareWeight > 0) {
+ delta +=
+ cfg.netShareWeight * (orig_share_cost - total_net_share) / std::max<double>(total_net_share, 1e-20);
+ }
+ n_move++;
+ // SA acceptance criteria
+ if (delta < 0 || (temp > 1e-8 && (ctx->rng() / float(0x3fffffff)) <= std::exp(-delta / temp))) {
+ n_accept++;
+#if CHAIN_DEBUG
+ log_info("accepted chain swap %s\n", cell->name.c_str(ctx));
+#endif
+ } else {
+ goto swap_fail;
+ }
+ commit_cost_changes(moveChange);
+ return true;
+ swap_fail:
+#if CHAIN_DEBUG
+ log_info("Swap failed\n");
+#endif
+ for (auto cell_pair : moved_cells) {
+ CellInfo *cell = ctx->cells.at(cell_pair.first).get();
+ if (cell->bel != BelId()) {
+#if CHAIN_DEBUG
+ log_info("%d unbind %s\n", __LINE__, ctx->nameOfBel(cell->bel));
+#endif
+ ctx->unbindBel(cell->bel);
+ }
+ }
+ for (auto cell_pair : moved_cells) {
+ CellInfo *cell = ctx->cells.at(cell_pair.first).get();
+#if CHAIN_DEBUG
+ log_info("%d bind %s %s\n", __LINE__, ctx->nameOfBel(cell_pair.second), cell->name.c_str(ctx));
+#endif
+ ctx->bindBel(cell_pair.second, cell, STRENGTH_WEAK);
+ }
+ return false;
+ }
+
+ // Find a random Bel of the correct type for a cell, within the specified
+ // diameter
+ BelId random_bel_for_cell(CellInfo *cell, int force_z = -1)
+ {
+ IdString targetType = cell->type;
+ Loc curr_loc = ctx->getBelLocation(cell->bel);
+ int count = 0;
+
+ int dx = diameter, dy = diameter;
+ if (cell->region != nullptr && cell->region->constr_bels) {
+ dx = std::min(cfg.hpwl_scale_x * diameter,
+ (region_bounds[cell->region->name].x1 - region_bounds[cell->region->name].x0) + 1);
+ dy = std::min(cfg.hpwl_scale_y * diameter,
+ (region_bounds[cell->region->name].y1 - region_bounds[cell->region->name].y0) + 1);
+ // Clamp location to within bounds
+ curr_loc.x = std::max(region_bounds[cell->region->name].x0, curr_loc.x);
+ curr_loc.x = std::min(region_bounds[cell->region->name].x1, curr_loc.x);
+ curr_loc.y = std::max(region_bounds[cell->region->name].y0, curr_loc.y);
+ curr_loc.y = std::min(region_bounds[cell->region->name].y1, curr_loc.y);
+ }
+
+ FastBels::FastBelsData *bel_data;
+ auto type_cnt = fast_bels.getBelsForCellType(targetType, &bel_data);
+
+ while (true) {
+ int nx = ctx->rng(2 * dx + 1) + std::max(curr_loc.x - dx, 0);
+ int ny = ctx->rng(2 * dy + 1) + std::max(curr_loc.y - dy, 0);
+ if (cfg.minBelsForGridPick >= 0 && type_cnt < cfg.minBelsForGridPick)
+ nx = ny = 0;
+ if (nx >= int(bel_data->size()))
+ continue;
+ if (ny >= int(bel_data->at(nx).size()))
+ continue;
+ const auto &fb = bel_data->at(nx).at(ny);
+ if (fb.size() == 0)
+ continue;
+ BelId bel = fb.at(ctx->rng(int(fb.size())));
+ if (force_z != -1) {
+ Loc loc = ctx->getBelLocation(bel);
+ if (loc.z != force_z)
+ continue;
+ }
+ if (!cell->testRegion(bel))
+ continue;
+ if (locked_bels.find(bel) != locked_bels.end())
+ continue;
+ count++;
+ return bel;
+ }
+ }
+
+ // Return true if a net is to be entirely ignored
+ inline bool ignore_net(NetInfo *net)
+ {
+ return net->driver.cell == nullptr || net->driver.cell->bel == BelId() ||
+ ctx->getBelGlobalBuf(net->driver.cell->bel);
+ }
+
+ // Get the bounding box for a net
+ inline BoundingBox get_net_bounds(NetInfo *net)
+ {
+ BoundingBox bb;
+ NPNR_ASSERT(net->driver.cell != nullptr);
+ Loc dloc = ctx->getBelLocation(net->driver.cell->bel);
+ bb.x0 = dloc.x;
+ bb.x1 = dloc.x;
+ bb.y0 = dloc.y;
+ bb.y1 = dloc.y;
+ bb.nx0 = 1;
+ bb.nx1 = 1;
+ bb.ny0 = 1;
+ bb.ny1 = 1;
+ for (auto user : net->users) {
+ if (user.cell->bel == BelId())
+ continue;
+ Loc uloc = ctx->getBelLocation(user.cell->bel);
+ if (bb.x0 == uloc.x)
+ ++bb.nx0;
+ else if (uloc.x < bb.x0) {
+ bb.x0 = uloc.x;
+ bb.nx0 = 1;
+ }
+ if (bb.x1 == uloc.x)
+ ++bb.nx1;
+ else if (uloc.x > bb.x1) {
+ bb.x1 = uloc.x;
+ bb.nx1 = 1;
+ }
+ if (bb.y0 == uloc.y)
+ ++bb.ny0;
+ else if (uloc.y < bb.y0) {
+ bb.y0 = uloc.y;
+ bb.ny0 = 1;
+ }
+ if (bb.y1 == uloc.y)
+ ++bb.ny1;
+ else if (uloc.y > bb.y1) {
+ bb.y1 = uloc.y;
+ bb.ny1 = 1;
+ }
+ }
+
+ return bb;
+ }
+
+ // Get the timing cost for an arc of a net
+ inline double get_timing_cost(NetInfo *net, const PortRef &user)
+ {
+ int cc;
+ if (net->driver.cell == nullptr)
+ return 0;
+ if (ctx->getPortTimingClass(net->driver.cell, net->driver.port, cc) == TMG_IGNORE)
+ return 0;
+ if (cfg.budgetBased) {
+ double delay = ctx->getDelayNS(ctx->predictArcDelay(net, user));
+ return std::min(10.0, std::exp(delay - ctx->getDelayNS(user.budget) / 10));
+ } else {
+ float crit = tmg.get_criticality(CellPortKey(user));
+ double delay = ctx->getDelayNS(ctx->predictArcDelay(net, user));
+ return delay * std::pow(crit, crit_exp);
+ }
+ }
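+
+    // Worked example with illustrative numbers: in the criticality-driven branch,
+    // an arc with predicted delay 2.0 ns and criticality 0.9 costs
+    // 2.0 * 0.9^8 ~= 0.86, while the same arc at criticality 0.5 costs only
+    // 2.0 * 0.5^8 ~= 0.008 (crit_exp is 8), so near-critical arcs dominate the
+    // timing cost.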
+
+ // Set up the cost maps
+ void setup_costs()
+ {
+ for (auto &net : ctx->nets) {
+ NetInfo *ni = net.second.get();
+ if (ignore_net(ni))
+ continue;
+ net_bounds[ni->udata] = get_net_bounds(ni);
+ if (cfg.timing_driven && int(ni->users.entries()) < cfg.timingFanoutThresh)
+ for (auto usr : ni->users.enumerate())
+ net_arc_tcost[ni->udata][usr.index.idx()] = get_timing_cost(ni, usr.value);
+ }
+ }
+
+ // Get the total wiring cost for the design
+ wirelen_t total_wirelen_cost()
+ {
+ wirelen_t cost = 0;
+ for (const auto &net : net_bounds)
+ cost += net.hpwl(cfg);
+ return cost;
+ }
+
+ // Get the total timing cost for the design
+ double total_timing_cost()
+ {
+ double cost = 0;
+ for (const auto &net : net_arc_tcost) {
+ for (auto arc_cost : net) {
+ cost += arc_cost;
+ }
+ }
+ return cost;
+ }
+
+ // Cost-change-related data for a move
+ struct MoveChangeData
+ {
+
+ enum BoundChangeType
+ {
+ NO_CHANGE,
+ CELL_MOVED_INWARDS,
+ CELL_MOVED_OUTWARDS,
+ FULL_RECOMPUTE
+ };
+
+ std::vector<decltype(NetInfo::udata)> bounds_changed_nets_x, bounds_changed_nets_y;
+ std::vector<std::pair<decltype(NetInfo::udata), store_index<PortRef>>> changed_arcs;
+
+ std::vector<BoundChangeType> already_bounds_changed_x, already_bounds_changed_y;
+ std::vector<std::vector<bool>> already_changed_arcs;
+
+ std::vector<BoundingBox> new_net_bounds;
+ std::vector<std::pair<std::pair<decltype(NetInfo::udata), store_index<PortRef>>, double>> new_arc_costs;
+
+ wirelen_t wirelen_delta = 0;
+ double timing_delta = 0;
+
+ void init(SAPlacer *p)
+ {
+ already_bounds_changed_x.resize(p->ctx->nets.size());
+ already_bounds_changed_y.resize(p->ctx->nets.size());
+ already_changed_arcs.resize(p->ctx->nets.size());
+ for (auto &net : p->ctx->nets) {
+ already_changed_arcs.at(net.second->udata).resize(net.second->users.capacity());
+ }
+ new_net_bounds = p->net_bounds;
+ }
+
+ void reset(SAPlacer *p)
+ {
+ for (auto bc : bounds_changed_nets_x) {
+ new_net_bounds[bc] = p->net_bounds[bc];
+ already_bounds_changed_x[bc] = NO_CHANGE;
+ }
+ for (auto bc : bounds_changed_nets_y) {
+ new_net_bounds[bc] = p->net_bounds[bc];
+ already_bounds_changed_y[bc] = NO_CHANGE;
+ }
+ for (const auto &tc : changed_arcs)
+ already_changed_arcs[tc.first][tc.second.idx()] = false;
+ bounds_changed_nets_x.clear();
+ bounds_changed_nets_y.clear();
+ changed_arcs.clear();
+ new_arc_costs.clear();
+ wirelen_delta = 0;
+ timing_delta = 0;
+ }
+
+ } moveChange;
+
+ void add_move_cell(MoveChangeData &mc, CellInfo *cell, BelId old_bel)
+ {
+ Loc curr_loc = ctx->getBelLocation(cell->bel);
+ Loc old_loc = ctx->getBelLocation(old_bel);
+ // Check net bounds
+ for (const auto &port : cell->ports) {
+ NetInfo *pn = port.second.net;
+ if (pn == nullptr)
+ continue;
+ if (ignore_net(pn))
+ continue;
+ BoundingBox &curr_bounds = mc.new_net_bounds[pn->udata];
+ // Incremental bounding box updates
+            // Note that everything other than full updates is applied immediately rather than being queued,
+ // so further updates to the same net in the same move are dealt with correctly.
+ // If a full update is already queued, this can be considered a no-op
+ if (mc.already_bounds_changed_x[pn->udata] != MoveChangeData::FULL_RECOMPUTE) {
+ // Bounds x0
+ if (curr_loc.x < curr_bounds.x0) {
+ // Further out than current bounds x0
+ curr_bounds.x0 = curr_loc.x;
+ curr_bounds.nx0 = 1;
+ if (mc.already_bounds_changed_x[pn->udata] == MoveChangeData::NO_CHANGE) {
+ // Checking already_bounds_changed_x ensures that each net is only added once
+ // to bounds_changed_nets, lest we add its HPWL change multiple times skewing the
+ // overall cost change
+ mc.already_bounds_changed_x[pn->udata] = MoveChangeData::CELL_MOVED_OUTWARDS;
+ mc.bounds_changed_nets_x.push_back(pn->udata);
+ }
+ } else if (curr_loc.x == curr_bounds.x0 && old_loc.x > curr_bounds.x0) {
+ curr_bounds.nx0++;
+ if (mc.already_bounds_changed_x[pn->udata] == MoveChangeData::NO_CHANGE) {
+ mc.already_bounds_changed_x[pn->udata] = MoveChangeData::CELL_MOVED_OUTWARDS;
+ mc.bounds_changed_nets_x.push_back(pn->udata);
+ }
+ } else if (old_loc.x == curr_bounds.x0 && curr_loc.x > curr_bounds.x0) {
+ if (mc.already_bounds_changed_x[pn->udata] == MoveChangeData::NO_CHANGE)
+ mc.bounds_changed_nets_x.push_back(pn->udata);
+ if (curr_bounds.nx0 == 1) {
+ mc.already_bounds_changed_x[pn->udata] = MoveChangeData::FULL_RECOMPUTE;
+ } else {
+ curr_bounds.nx0--;
+ if (mc.already_bounds_changed_x[pn->udata] == MoveChangeData::NO_CHANGE)
+ mc.already_bounds_changed_x[pn->udata] = MoveChangeData::CELL_MOVED_INWARDS;
+ }
+ }
+
+ // Bounds x1
+ if (curr_loc.x > curr_bounds.x1) {
+ // Further out than current bounds x1
+ curr_bounds.x1 = curr_loc.x;
+ curr_bounds.nx1 = 1;
+ if (mc.already_bounds_changed_x[pn->udata] == MoveChangeData::NO_CHANGE) {
+ // Checking already_bounds_changed_x ensures that each net is only added once
+ // to bounds_changed_nets, lest we add its HPWL change multiple times skewing the
+ // overall cost change
+ mc.already_bounds_changed_x[pn->udata] = MoveChangeData::CELL_MOVED_OUTWARDS;
+ mc.bounds_changed_nets_x.push_back(pn->udata);
+ }
+ } else if (curr_loc.x == curr_bounds.x1 && old_loc.x < curr_bounds.x1) {
+ curr_bounds.nx1++;
+ if (mc.already_bounds_changed_x[pn->udata] == MoveChangeData::NO_CHANGE) {
+ mc.already_bounds_changed_x[pn->udata] = MoveChangeData::CELL_MOVED_OUTWARDS;
+ mc.bounds_changed_nets_x.push_back(pn->udata);
+ }
+ } else if (old_loc.x == curr_bounds.x1 && curr_loc.x < curr_bounds.x1) {
+ if (mc.already_bounds_changed_x[pn->udata] == MoveChangeData::NO_CHANGE)
+ mc.bounds_changed_nets_x.push_back(pn->udata);
+ if (curr_bounds.nx1 == 1) {
+ mc.already_bounds_changed_x[pn->udata] = MoveChangeData::FULL_RECOMPUTE;
+ } else {
+ curr_bounds.nx1--;
+ if (mc.already_bounds_changed_x[pn->udata] == MoveChangeData::NO_CHANGE)
+ mc.already_bounds_changed_x[pn->udata] = MoveChangeData::CELL_MOVED_INWARDS;
+ }
+ }
+ }
+ if (mc.already_bounds_changed_y[pn->udata] != MoveChangeData::FULL_RECOMPUTE) {
+ // Bounds y0
+ if (curr_loc.y < curr_bounds.y0) {
+ // Further out than current bounds y0
+ curr_bounds.y0 = curr_loc.y;
+ curr_bounds.ny0 = 1;
+ if (mc.already_bounds_changed_y[pn->udata] == MoveChangeData::NO_CHANGE) {
+ mc.already_bounds_changed_y[pn->udata] = MoveChangeData::CELL_MOVED_OUTWARDS;
+ mc.bounds_changed_nets_y.push_back(pn->udata);
+ }
+ } else if (curr_loc.y == curr_bounds.y0 && old_loc.y > curr_bounds.y0) {
+ curr_bounds.ny0++;
+ if (mc.already_bounds_changed_y[pn->udata] == MoveChangeData::NO_CHANGE) {
+ mc.already_bounds_changed_y[pn->udata] = MoveChangeData::CELL_MOVED_OUTWARDS;
+ mc.bounds_changed_nets_y.push_back(pn->udata);
+ }
+ } else if (old_loc.y == curr_bounds.y0 && curr_loc.y > curr_bounds.y0) {
+ if (mc.already_bounds_changed_y[pn->udata] == MoveChangeData::NO_CHANGE)
+ mc.bounds_changed_nets_y.push_back(pn->udata);
+ if (curr_bounds.ny0 == 1) {
+ mc.already_bounds_changed_y[pn->udata] = MoveChangeData::FULL_RECOMPUTE;
+ } else {
+ curr_bounds.ny0--;
+ if (mc.already_bounds_changed_y[pn->udata] == MoveChangeData::NO_CHANGE)
+ mc.already_bounds_changed_y[pn->udata] = MoveChangeData::CELL_MOVED_INWARDS;
+ }
+ }
+
+ // Bounds y1
+ if (curr_loc.y > curr_bounds.y1) {
+ // Further out than current bounds y1
+ curr_bounds.y1 = curr_loc.y;
+ curr_bounds.ny1 = 1;
+ if (mc.already_bounds_changed_y[pn->udata] == MoveChangeData::NO_CHANGE) {
+ mc.already_bounds_changed_y[pn->udata] = MoveChangeData::CELL_MOVED_OUTWARDS;
+ mc.bounds_changed_nets_y.push_back(pn->udata);
+ }
+ } else if (curr_loc.y == curr_bounds.y1 && old_loc.y < curr_bounds.y1) {
+ curr_bounds.ny1++;
+ if (mc.already_bounds_changed_y[pn->udata] == MoveChangeData::NO_CHANGE) {
+ mc.already_bounds_changed_y[pn->udata] = MoveChangeData::CELL_MOVED_OUTWARDS;
+ mc.bounds_changed_nets_y.push_back(pn->udata);
+ }
+ } else if (old_loc.y == curr_bounds.y1 && curr_loc.y < curr_bounds.y1) {
+ if (mc.already_bounds_changed_y[pn->udata] == MoveChangeData::NO_CHANGE)
+ mc.bounds_changed_nets_y.push_back(pn->udata);
+ if (curr_bounds.ny1 == 1) {
+ mc.already_bounds_changed_y[pn->udata] = MoveChangeData::FULL_RECOMPUTE;
+ } else {
+ curr_bounds.ny1--;
+ if (mc.already_bounds_changed_y[pn->udata] == MoveChangeData::NO_CHANGE)
+ mc.already_bounds_changed_y[pn->udata] = MoveChangeData::CELL_MOVED_INWARDS;
+ }
+ }
+ }
+
+ if (cfg.timing_driven && int(pn->users.entries()) < cfg.timingFanoutThresh) {
+ // Output ports - all arcs change timing
+ if (port.second.type == PORT_OUT) {
+ int cc;
+ TimingPortClass cls = ctx->getPortTimingClass(cell, port.first, cc);
+ if (cls != TMG_IGNORE)
+ for (auto usr : pn->users.enumerate())
+ if (!mc.already_changed_arcs[pn->udata][usr.index.idx()]) {
+ mc.changed_arcs.emplace_back(std::make_pair(pn->udata, usr.index));
+ mc.already_changed_arcs[pn->udata][usr.index.idx()] = true;
+ }
+ } else if (port.second.type == PORT_IN) {
+ auto usr_idx = port.second.user_idx;
+ if (!mc.already_changed_arcs[pn->udata][usr_idx.idx()]) {
+ mc.changed_arcs.emplace_back(std::make_pair(pn->udata, usr_idx));
+ mc.already_changed_arcs[pn->udata][usr_idx.idx()] = true;
+ }
+ }
+ }
+ }
+ }
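+
+    // Worked example with illustrative numbers: if a net's bounding box has x0 = 4
+    // with nx0 = 1 and the only cell at x = 4 moves to x = 9, the new x0 cannot be
+    // derived incrementally (it depends on all remaining pins), so the net is
+    // flagged FULL_RECOMPUTE and rebuilt in compute_cost_changes(); if nx0 had been
+    // 2, the edge stays at x0 = 4 and only the counter drops to 1.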
+
+ void compute_cost_changes(MoveChangeData &md)
+ {
+ for (const auto &bc : md.bounds_changed_nets_x) {
+ if (md.already_bounds_changed_x[bc] == MoveChangeData::FULL_RECOMPUTE)
+ md.new_net_bounds[bc] = get_net_bounds(net_by_udata[bc]);
+ }
+ for (const auto &bc : md.bounds_changed_nets_y) {
+ if (md.already_bounds_changed_x[bc] != MoveChangeData::FULL_RECOMPUTE &&
+ md.already_bounds_changed_y[bc] == MoveChangeData::FULL_RECOMPUTE)
+ md.new_net_bounds[bc] = get_net_bounds(net_by_udata[bc]);
+ }
+
+ for (const auto &bc : md.bounds_changed_nets_x)
+ md.wirelen_delta += md.new_net_bounds[bc].hpwl(cfg) - net_bounds[bc].hpwl(cfg);
+ for (const auto &bc : md.bounds_changed_nets_y)
+ if (md.already_bounds_changed_x[bc] == MoveChangeData::NO_CHANGE)
+ md.wirelen_delta += md.new_net_bounds[bc].hpwl(cfg) - net_bounds[bc].hpwl(cfg);
+
+ if (cfg.timing_driven) {
+ for (const auto &tc : md.changed_arcs) {
+ double old_cost = net_arc_tcost.at(tc.first).at(tc.second.idx());
+ double new_cost =
+ get_timing_cost(net_by_udata.at(tc.first), net_by_udata.at(tc.first)->users.at(tc.second));
+ md.new_arc_costs.emplace_back(std::make_pair(tc, new_cost));
+ md.timing_delta += (new_cost - old_cost);
+ md.already_changed_arcs[tc.first][tc.second.idx()] = false;
+ }
+ }
+ }
+
+ void commit_cost_changes(MoveChangeData &md)
+ {
+ for (const auto &bc : md.bounds_changed_nets_x)
+ net_bounds[bc] = md.new_net_bounds[bc];
+ for (const auto &bc : md.bounds_changed_nets_y)
+ net_bounds[bc] = md.new_net_bounds[bc];
+ for (const auto &tc : md.new_arc_costs)
+ net_arc_tcost[tc.first.first].at(tc.first.second.idx()) = tc.second;
+ curr_wirelen_cost += md.wirelen_delta;
+ curr_timing_cost += md.timing_delta;
+ }
+
+    // Simple routability-driven placement
+ const int large_cell_thresh = 50;
+ int total_net_share = 0;
+ std::vector<std::vector<dict<IdString, int>>> nets_by_tile;
+ void setup_nets_by_tile()
+ {
+ total_net_share = 0;
+ nets_by_tile.resize(max_x + 1, std::vector<dict<IdString, int>>(max_y + 1));
+ for (auto &cell : ctx->cells) {
+ CellInfo *ci = cell.second.get();
+ if (int(ci->ports.size()) > large_cell_thresh)
+ continue;
+ Loc loc = ctx->getBelLocation(ci->bel);
+ auto &nbt = nets_by_tile.at(loc.x).at(loc.y);
+ for (const auto &port : ci->ports) {
+ if (port.second.net == nullptr)
+ continue;
+ if (port.second.net->driver.cell == nullptr || ctx->getBelGlobalBuf(port.second.net->driver.cell->bel))
+ continue;
+ int &s = nbt[port.second.net->name];
+ if (s > 0)
+ ++total_net_share;
+ ++s;
+ }
+ }
+ }
+
+ int update_nets_by_tile(CellInfo *ci, Loc old_loc, Loc new_loc)
+ {
+ if (int(ci->ports.size()) > large_cell_thresh)
+ return 0;
+ int loss = 0, gain = 0;
+ auto &nbt_old = nets_by_tile.at(old_loc.x).at(old_loc.y);
+ auto &nbt_new = nets_by_tile.at(new_loc.x).at(new_loc.y);
+
+ for (const auto &port : ci->ports) {
+ if (port.second.net == nullptr)
+ continue;
+ if (port.second.net->driver.cell == nullptr || ctx->getBelGlobalBuf(port.second.net->driver.cell->bel))
+ continue;
+ int &o = nbt_old[port.second.net->name];
+ --o;
+ NPNR_ASSERT(o >= 0);
+ if (o > 0)
+ ++loss;
+ int &n = nbt_new[port.second.net->name];
+ if (n > 0)
+ ++gain;
+ ++n;
+ }
+ int delta = gain - loss;
+ total_net_share += delta;
+ return delta;
+ }
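+
+    // Worked example (hypothetical situation): if cells A and B in one tile both
+    // connect to net N, that tile contributes 1 to total_net_share. Moving B away
+    // removes that contribution (loss = 1) and adds a new one (gain = 1) only if
+    // the destination tile already has a cell on N, so delta = gain - loss rewards
+    // keeping connected cells in the same tile.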
+
+ // Get the combined wirelen/timing metric
+ inline double curr_metric()
+ {
+ return lambda * curr_timing_cost + (1 - lambda) * curr_wirelen_cost - cfg.netShareWeight * total_net_share;
+ }
+
+    // Map nets to their bounding box (so we can skip recomputation for moves that do not exceed the bounds)
+ std::vector<BoundingBox> net_bounds;
+ // Map net arcs to their timing cost (criticality * delay ns)
+ std::vector<std::vector<double>> net_arc_tcost;
+
+    // Fast lookup from cluster to its member cells
+ dict<ClusterId, std::vector<CellInfo *>> cluster2cell;
+
+ // Wirelength and timing cost at last and current iteration
+ wirelen_t last_wirelen_cost, curr_wirelen_cost;
+ double last_timing_cost, curr_timing_cost;
+
+ Context *ctx;
+ float temp = 10;
+ float crit_exp = 8;
+ float lambda = 0.5;
+ bool improved = false;
+ int n_move, n_accept;
+ int diameter = 35, max_x = 1, max_y = 1;
+ dict<IdString, std::tuple<int, int>> bel_types;
+ dict<IdString, BoundingBox> region_bounds;
+ FastBels fast_bels;
+ pool<BelId> locked_bels;
+ std::vector<NetInfo *> net_by_udata;
+ std::vector<decltype(NetInfo::udata)> old_udata;
+ bool require_legal = true;
+ const int legalise_dia = 4;
+ Placer1Cfg cfg;
+
+ TimingAnalyser tmg;
+};
+
+Placer1Cfg::Placer1Cfg(Context *ctx)
+{
+ constraintWeight = ctx->setting<float>("placer1/constraintWeight", 10);
+ netShareWeight = ctx->setting<float>("placer1/netShareWeight", 0);
+ minBelsForGridPick = ctx->setting<int>("placer1/minBelsForGridPick", 64);
+ budgetBased = ctx->setting<bool>("placer1/budgetBased", false);
+ startTemp = ctx->setting<float>("placer1/startTemp", 1);
+ timingFanoutThresh = std::numeric_limits<int>::max();
+ timing_driven = ctx->setting<bool>("timing_driven");
+ slack_redist_iter = ctx->setting<int>("slack_redist_iter");
+ hpwl_scale_x = 1;
+ hpwl_scale_y = 1;
+}
+
+bool placer1(Context *ctx, Placer1Cfg cfg)
+{
+ try {
+ SAPlacer placer(ctx, cfg);
+ placer.place();
+ log_info("Checksum: 0x%08x\n", ctx->checksum());
+#ifndef NDEBUG
+ ctx->lock();
+ ctx->check();
+ ctx->unlock();
+#endif
+ return true;
+ } catch (log_execution_error_exception) {
+#ifndef NDEBUG
+ ctx->lock();
+ ctx->check();
+ ctx->unlock();
+#endif
+ return false;
+ }
+}
+
+bool placer1_refine(Context *ctx, Placer1Cfg cfg)
+{
+ try {
+ SAPlacer placer(ctx, cfg);
+ placer.place(true);
+ log_info("Checksum: 0x%08x\n", ctx->checksum());
+#ifndef NDEBUG
+ ctx->lock();
+ ctx->check();
+ ctx->unlock();
+#endif
+ return true;
+ } catch (log_execution_error_exception) {
+#ifndef NDEBUG
+ ctx->lock();
+ ctx->check();
+ ctx->unlock();
+#endif
+ return false;
+ }
+}
+
+NEXTPNR_NAMESPACE_END
diff --git a/common/place/placer1.h b/common/place/placer1.h
new file mode 100644
index 00000000..9dfb0b0d
--- /dev/null
+++ b/common/place/placer1.h
@@ -0,0 +1,45 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2018 Claire Xenia Wolf <claire@yosyshq.com>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+#ifndef PLACE_H
+#define PLACE_H
+
+#include "log.h"
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+struct Placer1Cfg
+{
+ Placer1Cfg(Context *ctx);
+ float constraintWeight, netShareWeight;
+ int minBelsForGridPick;
+ bool budgetBased;
+ float startTemp;
+ int timingFanoutThresh;
+ bool timing_driven;
+ int slack_redist_iter;
+ int hpwl_scale_x, hpwl_scale_y;
+};
+
+extern bool placer1(Context *ctx, Placer1Cfg cfg);
+extern bool placer1_refine(Context *ctx, Placer1Cfg cfg);
+
+NEXTPNR_NAMESPACE_END
+
+#endif // PLACE_H
diff --git a/common/place/placer_heap.cc b/common/place/placer_heap.cc
new file mode 100644
index 00000000..4c9ffb23
--- /dev/null
+++ b/common/place/placer_heap.cc
@@ -0,0 +1,1830 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2019 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * [[cite]] HeAP
+ * Analytical Placement for Heterogeneous FPGAs, Marcel Gort and Jason H. Anderson
+ * https://janders.eecg.utoronto.ca/pdfs/marcelfpl12.pdf
+ *
+ * [[cite]] SimPL
+ * SimPL: An Effective Placement Algorithm, Myung-Chul Kim, Dong-Jin Lee and Igor L. Markov
+ * http://www.ece.umich.edu/cse/awards/pdfs/iccad10-simpl.pdf
+ *
+ * Notable changes from the original algorithm
+ * - Following the other nextpnr placer, Bels are placed rather than CLBs. This means a strict legalisation pass is
+ * added in addition to coarse legalisation (referred to as "spreading" to avoid confusion with strict legalisation)
+ * as described in HeAP to ensure validity. This searches random bels in the vicinity of the position chosen by
+ * spreading, with diameter increasing over iterations, with a heuristic to prefer lower wirelength choices.
+ * - To make the placer timing-driven, the bound2bound weights are multiplied by (1 + 10 * crit^2)
+ */
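+
+// Worked example of the timing weighting above, with illustrative numbers: an arc
+// with criticality 0.9 has its bound2bound weight scaled by 1 + 10 * 0.9^2 = 9.1,
+// whereas a non-critical arc (crit = 0) keeps weight 1, so timing-critical cells
+// are pulled together more strongly in the solved placement.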
+
+#ifdef WITH_HEAP
+
+#include "placer_heap.h"
+#include <Eigen/Core>
+#include <Eigen/IterativeLinearSolvers>
+#include <boost/optional.hpp>
+#include <chrono>
+#include <deque>
+#include <fstream>
+#include <numeric>
+#include <queue>
+#include <tuple>
+#include "fast_bels.h"
+#include "log.h"
+#include "nextpnr.h"
+#include "parallel_refine.h"
+#include "place_common.h"
+#include "placer1.h"
+#include "scope_lock.h"
+#include "timing.h"
+#include "util.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+namespace {
+// A simple internal representation for a sparse system of equations Ax = rhs
+// This is designed to decouple the functions that build the matrix from the engine that
+// solves it, and from the internal representation that the solver requires
+template <typename T> struct EquationSystem
+{
+
+ EquationSystem(size_t rows, size_t cols)
+ {
+ A.resize(cols);
+ rhs.resize(rows);
+ }
+
+ // Simple sparse format, easy to convert to CCS for solver
+ std::vector<std::vector<std::pair<int, T>>> A; // col -> (row, x[row, col]) sorted by row
+ std::vector<T> rhs; // RHS vector
+ void reset()
+ {
+ for (auto &col : A)
+ col.clear();
+ std::fill(rhs.begin(), rhs.end(), T());
+ }
+
+ void add_coeff(int row, int col, T val)
+ {
+ auto &Ac = A.at(col);
+ // Binary search
+ int b = 0, e = int(Ac.size()) - 1;
+ while (b <= e) {
+ int i = (b + e) / 2;
+ if (Ac.at(i).first == row) {
+ Ac.at(i).second += val;
+ return;
+ }
+ if (Ac.at(i).first > row)
+ e = i - 1;
+ else
+ b = i + 1;
+ }
+ Ac.insert(Ac.begin() + b, std::make_pair(row, val));
+ }
+
+ void add_rhs(int row, T val) { rhs[row] += val; }
+
+ void solve(std::vector<T> &x, float tolerance)
+ {
+ using namespace Eigen;
+ if (x.empty())
+ return;
+ NPNR_ASSERT(x.size() == A.size());
+
+ VectorXd vx(x.size()), vb(rhs.size());
+ SparseMatrix<T> mat(A.size(), A.size());
+
+ std::vector<int> colnnz;
+ for (auto &Ac : A)
+ colnnz.push_back(int(Ac.size()));
+ mat.reserve(colnnz);
+ for (int col = 0; col < int(A.size()); col++) {
+ auto &Ac = A.at(col);
+ for (auto &el : Ac)
+ mat.insert(el.first, col) = el.second;
+ }
+
+ for (int i = 0; i < int(x.size()); i++)
+ vx[i] = x.at(i);
+ for (int i = 0; i < int(rhs.size()); i++)
+ vb[i] = rhs.at(i);
+
+ ConjugateGradient<SparseMatrix<T>, Lower | Upper> solver;
+ solver.setTolerance(tolerance);
+ VectorXd xr = solver.compute(mat).solveWithGuess(vb, vx);
+ for (int i = 0; i < int(x.size()); i++)
+ x.at(i) = xr[i];
+ // for (int i = 0; i < int(x.size()); i++)
+ // log_info("x[%d] = %f\n", i, x.at(i));
+ }
+};
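+
+// Usage sketch for EquationSystem, with illustrative values: solving the 2x2
+// diagonal system 2*x0 = 6, 4*x1 = 8.
+//   EquationSystem<double> es(2, 2);
+//   es.add_coeff(0, 0, 2.0);  es.add_coeff(1, 1, 4.0);
+//   es.add_rhs(0, 6.0);       es.add_rhs(1, 8.0);
+//   std::vector<double> x = {0.0, 0.0};  // initial guess
+//   es.solve(x, 1e-5);                   // x converges to approximately {3.0, 2.0}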
+
+} // namespace
+
+class HeAPPlacer
+{
+ public:
+ HeAPPlacer(Context *ctx, PlacerHeapCfg cfg)
+ : ctx(ctx), cfg(cfg), fast_bels(ctx, /*check_bel_available=*/true, -1), tmg(ctx)
+ {
+ Eigen::initParallel();
+ tmg.setup_only = true;
+ tmg.setup();
+
+ for (auto &cell : ctx->cells)
+ if (cell.second->cluster != ClusterId())
+ cluster2cells[cell.second->cluster].push_back(cell.second.get());
+ }
+
+ bool place()
+ {
+ auto startt = std::chrono::high_resolution_clock::now();
+
+ ScopeLock<Context> lock(ctx);
+ place_constraints();
+ build_fast_bels();
+ seed_placement();
+ update_all_chains();
+ wirelen_t hpwl = total_hpwl();
+ log_info("Creating initial analytic placement for %d cells, random placement wirelen = %d.\n",
+ int(place_cells.size()), int(hpwl));
+ for (int i = 0; i < 4; i++) {
+ setup_solve_cells();
+ auto solve_startt = std::chrono::high_resolution_clock::now();
+#ifdef NPNR_DISABLE_THREADS
+ build_solve_direction(false, -1);
+ build_solve_direction(true, -1);
+#else
+ boost::thread xaxis([&]() { build_solve_direction(false, -1); });
+ build_solve_direction(true, -1);
+ xaxis.join();
+#endif
+ auto solve_endt = std::chrono::high_resolution_clock::now();
+ solve_time += std::chrono::duration<double>(solve_endt - solve_startt).count();
+
+ update_all_chains();
+
+ hpwl = total_hpwl();
+ log_info(" at initial placer iter %d, wirelen = %d\n", i, int(hpwl));
+ }
+
+ wirelen_t solved_hpwl = 0, spread_hpwl = 0, legal_hpwl = 0, best_hpwl = std::numeric_limits<wirelen_t>::max();
+ int iter = 0, stalled = 0;
+
+ std::vector<std::tuple<CellInfo *, BelId, PlaceStrength>> solution;
+
+ std::vector<pool<BelBucketId>> heap_runs;
+ pool<BelBucketId> all_buckets;
+ dict<BelBucketId, int> bucket_count;
+
+ for (auto cell : place_cells) {
+ BelBucketId bucket = ctx->getBelBucketForCellType(cell->type);
+ if (!all_buckets.count(bucket)) {
+ heap_runs.push_back(pool<BelBucketId>{bucket});
+ all_buckets.insert(bucket);
+ }
+ bucket_count[bucket]++;
+ }
+        // If more than 98% of cells belong to a single cell type, always solve all cells at once.
+        // Otherwise, follow the full HeAP strategy of alternating per-type runs with an all-type run.
+ for (auto &c : bucket_count) {
+ if (c.second >= 0.98 * int(place_cells.size())) {
+ heap_runs.clear();
+ break;
+ }
+ }
+
+ if (cfg.placeAllAtOnce) {
+ // Never want to deal with LUTs, FFs, MUXFxs separately,
+ // for now disable all single-cell-type runs and only have heterogeneous
+ // runs
+ heap_runs.clear();
+ }
+
+ heap_runs.push_back(all_buckets);
+ // The main HeAP placer loop
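+        // Iterate while fewer than 5 consecutive iterations have failed to improve the
+        // best legalised wirelength and the lower-bound (solved) wirelength is still no
+        // more than 80% of the legalised wirelength.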
+ log_info("Running main analytical placer.\n");
+ while (stalled < 5 && (solved_hpwl <= legal_hpwl * 0.8)) {
+ // Alternate between particular bel types and all bels
+ for (auto &run : heap_runs) {
+ auto run_startt = std::chrono::high_resolution_clock::now();
+
+ setup_solve_cells(&run);
+ if (solve_cells.empty())
+ continue;
+ // Heuristic: don't bother with threading below a certain size
+ auto solve_startt = std::chrono::high_resolution_clock::now();
+
+ // Build the connectivity matrix and run the solver; multithreaded between x and y axes if applicable
+#ifndef NPNR_DISABLE_THREADS
+ if (solve_cells.size() >= 500) {
+ boost::thread xaxis([&]() { build_solve_direction(false, (iter == 0) ? -1 : iter); });
+ build_solve_direction(true, (iter == 0) ? -1 : iter);
+ xaxis.join();
+ } else
+#endif
+ {
+ build_solve_direction(false, (iter == 0) ? -1 : iter);
+ build_solve_direction(true, (iter == 0) ? -1 : iter);
+ }
+ auto solve_endt = std::chrono::high_resolution_clock::now();
+ solve_time += std::chrono::duration<double>(solve_endt - solve_startt).count();
+ update_all_chains();
+ solved_hpwl = total_hpwl();
+
+ update_all_chains();
+
+ // Run the spreader
+ for (const auto &group : cfg.cellGroups)
+ CutSpreader(this, group).run();
+
+ for (auto type : run)
+ if (std::all_of(cfg.cellGroups.begin(), cfg.cellGroups.end(),
+ [type](const pool<BelBucketId> &grp) { return !grp.count(type); }))
+ CutSpreader(this, {type}).run();
+
+ // Run strict legalisation to find a valid bel for all cells
+ update_all_chains();
+ spread_hpwl = total_hpwl();
+ legalise_placement_strict(true);
+ update_all_chains();
+
+ legal_hpwl = total_hpwl();
+ auto run_stopt = std::chrono::high_resolution_clock::now();
+
+ IdString bucket_name = ctx->getBelBucketName(*run.begin());
+ log_info(" at iteration #%d, type %s: wirelen solved = %d, spread = %d, legal = %d; time = %.02fs\n",
+ iter + 1, (run.size() > 1 ? "ALL" : bucket_name.c_str(ctx)), int(solved_hpwl),
+ int(spread_hpwl), int(legal_hpwl),
+ std::chrono::duration<double>(run_stopt - run_startt).count());
+ }
+
+ // Update timing weights
+ if (cfg.timing_driven)
+ tmg.run();
+
+ if (legal_hpwl < best_hpwl) {
+ best_hpwl = legal_hpwl;
+ stalled = 0;
+ // Save solution
+ solution.clear();
+ for (auto &cell : ctx->cells) {
+ solution.emplace_back(cell.second.get(), cell.second->bel, cell.second->belStrength);
+ }
+ } else {
+ ++stalled;
+ }
+ for (auto &cl : cell_locs) {
+ cl.second.legal_x = cl.second.x;
+ cl.second.legal_y = cl.second.y;
+ }
+ ctx->yield();
+ ++iter;
+ }
+
+ // Apply saved solution
+ for (auto &sc : solution) {
+ CellInfo *cell = std::get<0>(sc);
+ if (cell->bel != BelId())
+ ctx->unbindBel(cell->bel);
+ }
+ for (auto &sc : solution) {
+ CellInfo *cell;
+ BelId bel;
+ PlaceStrength strength;
+ std::tie(cell, bel, strength) = sc;
+ ctx->bindBel(bel, cell, strength);
+ }
+
+ for (auto &cell : ctx->cells) {
+ if (cell.second->bel == BelId())
+ log_error("Found unbound cell %s\n", cell.first.c_str(ctx));
+ if (ctx->getBoundBelCell(cell.second->bel) != cell.second.get())
+ log_error("Found cell %s with mismatched binding\n", cell.first.c_str(ctx));
+ if (ctx->debug)
+ log_info("AP soln: %s -> %s\n", cell.first.c_str(ctx), ctx->nameOfBel(cell.second->bel));
+ }
+
+ bool any_bad_placements = false;
+ for (auto bel : ctx->getBels()) {
+ CellInfo *cell = ctx->getBoundBelCell(bel);
+ if (!ctx->isBelLocationValid(bel)) {
+ std::string cell_text = "no cell";
+ if (cell != nullptr)
+ cell_text = std::string("cell '") + ctx->nameOf(cell) + "'";
+ log_warning("post-placement validity check failed for Bel '%s' "
+ "(%s)\n",
+ ctx->nameOfBel(bel), cell_text.c_str());
+ any_bad_placements = true;
+ }
+ }
+
+ if (any_bad_placements) {
+ return false;
+ }
+
+ auto endtt = std::chrono::high_resolution_clock::now();
+ log_info("HeAP Placer Time: %.02fs\n", std::chrono::duration<double>(endtt - startt).count());
+ log_info(" of which solving equations: %.02fs\n", solve_time);
+ log_info(" of which spreading cells: %.02fs\n", cl_time);
+ log_info(" of which strict legalisation: %.02fs\n", sl_time);
+
+ ctx->check();
+ lock.unlock_early();
+
+#if !defined(__wasm)
+ if (cfg.parallelRefine) {
+ if (!parallel_refine(ctx, ParallelRefineCfg(ctx))) {
+ return false;
+ }
+ } else
+#endif
+ {
+ if (!placer1_refine(ctx, Placer1Cfg(ctx))) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ private:
+ Context *ctx;
+ PlacerHeapCfg cfg;
+
+ int max_x = 0, max_y = 0;
+ FastBels fast_bels;
+ dict<IdString, std::tuple<int, int>> bel_types;
+
+ TimingAnalyser tmg;
+
+ struct BoundingBox
+ {
+ // Actual bounding box
+ int x0 = 0, x1 = 0, y0 = 0, y1 = 0;
+ };
+
+ dict<IdString, BoundingBox> constraint_region_bounds;
+
+ // In some cases, we can't use bindBel because we allow overlap in the earlier stages. So we use this custom
+ // structure instead
+ struct CellLocation
+ {
+ int x, y;
+ int legal_x, legal_y;
+ double rawx, rawy;
+ bool locked, global;
+ };
+ dict<IdString, CellLocation> cell_locs;
+    // The set of cells that we will actually place. This excludes locked cells and child cells of macros/chains
+    // (only the root of each macro is placed).
+ std::vector<CellInfo *> place_cells;
+
+ // The cells in the current equation being solved (a subset of place_cells in some cases, where we only place
+ // cells of a certain type)
+ std::vector<CellInfo *> solve_cells;
+
+ dict<ClusterId, std::vector<CellInfo *>> cluster2cells;
+ dict<ClusterId, int> chain_size;
+ // Performance counting
+ double solve_time = 0, cl_time = 0, sl_time = 0;
+
+ // Place cells with the BEL attribute set to constrain them
+ void place_constraints()
+ {
+ size_t placed_cells = 0;
+ // Initial constraints placer
+ for (auto &cell_entry : ctx->cells) {
+ CellInfo *cell = cell_entry.second.get();
+
+ auto loc = cell->attrs.find(ctx->id("BEL"));
+ if (loc != cell->attrs.end()) {
+ std::string loc_name = loc->second.as_string();
+ BelId bel = ctx->getBelByNameStr(loc_name);
+ if (bel == BelId()) {
+ log_error("No Bel named \'%s\' located for "
+ "this chip (processing BEL attribute on \'%s\')\n",
+ loc_name.c_str(), cell->name.c_str(ctx));
+ }
+
+ if (!ctx->isValidBelForCellType(cell->type, bel)) {
+ IdString bel_type = ctx->getBelType(bel);
+ log_error("Bel \'%s\' of type \'%s\' does not match cell "
+ "\'%s\' of type \'%s\'\n",
+ loc_name.c_str(), bel_type.c_str(ctx), cell->name.c_str(ctx), cell->type.c_str(ctx));
+ }
+ auto bound_cell = ctx->getBoundBelCell(bel);
+ if (bound_cell) {
+ log_error("Cell \'%s\' cannot be bound to bel \'%s\' since it is already bound to cell \'%s\'\n",
+ cell->name.c_str(ctx), loc_name.c_str(), bound_cell->name.c_str(ctx));
+ }
+
+ ctx->bindBel(bel, cell, STRENGTH_USER);
+ if (!ctx->isBelLocationValid(bel)) {
+ IdString bel_type = ctx->getBelType(bel);
+ log_error("Bel \'%s\' of type \'%s\' is not valid for cell "
+ "\'%s\' of type \'%s\'\n",
+ loc_name.c_str(), bel_type.c_str(ctx), cell->name.c_str(ctx), cell->type.c_str(ctx));
+ }
+ placed_cells++;
+ }
+ }
+ log_info("Placed %d cells based on constraints.\n", int(placed_cells));
+ ctx->yield();
+ }
+
+ void build_fast_bels()
+ {
+ for (auto bel : ctx->getBels()) {
+ if (!ctx->checkBelAvail(bel))
+ continue;
+ Loc loc = ctx->getBelLocation(bel);
+ max_x = std::max(max_x, loc.x);
+ max_y = std::max(max_y, loc.y);
+ }
+
+ pool<IdString> cell_types_in_use;
+ pool<BelBucketId> buckets_in_use;
+ for (auto &cell : ctx->cells) {
+ IdString cell_type = cell.second->type;
+ cell_types_in_use.insert(cell_type);
+ BelBucketId bucket = ctx->getBelBucketForCellType(cell_type);
+ buckets_in_use.insert(bucket);
+ }
+
+ for (auto cell_type : cell_types_in_use) {
+ fast_bels.addCellType(cell_type);
+ }
+ for (auto bucket : buckets_in_use) {
+ fast_bels.addBelBucket(bucket);
+ }
+
+ // Determine bounding boxes of region constraints
+ for (auto &region : ctx->region) {
+ Region *r = region.second.get();
+ BoundingBox bb;
+ if (r->constr_bels) {
+ bb.x0 = std::numeric_limits<int>::max();
+ bb.x1 = std::numeric_limits<int>::min();
+ bb.y0 = std::numeric_limits<int>::max();
+ bb.y1 = std::numeric_limits<int>::min();
+ for (auto bel : r->bels) {
+ Loc loc = ctx->getBelLocation(bel);
+ bb.x0 = std::min(bb.x0, loc.x);
+ bb.x1 = std::max(bb.x1, loc.x);
+ bb.y0 = std::min(bb.y0, loc.y);
+ bb.y1 = std::max(bb.y1, loc.y);
+ }
+ } else {
+ bb.x0 = 0;
+ bb.y0 = 0;
+ bb.x1 = max_x;
+ bb.y1 = max_y;
+ }
+ constraint_region_bounds[r->name] = bb;
+ }
+ }
+
+ // Build and solve in one direction
+ void build_solve_direction(bool yaxis, int iter)
+ {
+ for (int i = 0; i < 5; i++) {
+ EquationSystem<double> esx(solve_cells.size(), solve_cells.size());
+ build_equations(esx, yaxis, iter);
+ solve_equations(esx, yaxis);
+ }
+ }
+
+ // Check if a cell has any meaningful connectivity
+ bool has_connectivity(CellInfo *cell)
+ {
+ for (auto port : cell->ports) {
+ if (port.second.net != nullptr && port.second.net->driver.cell != nullptr &&
+ !port.second.net->users.empty())
+ return true;
+ }
+ return false;
+ }
+
+ // Build up a random initial placement, without regard to legality
+ // FIXME: Are there better approaches to the initial placement (e.g. greedy?)
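+ // Cells already bound to a bel are treated as locked; every other macro/chain root is given a random,
+ // still-unused bel compatible with its type. Cells with no meaningful connectivity, and ioBufTypes
+ // cells, are bound and locked straight away (this helps avoid a singular matrix, see ioBufTypes).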
+ void seed_placement()
+ {
+ pool<IdString> cell_types;
+ for (const auto &cell : ctx->cells) {
+ cell_types.insert(cell.second->type);
+ }
+
+ pool<BelId> bels_used;
+ dict<IdString, std::deque<BelId>> available_bels;
+
+ for (auto bel : ctx->getBels()) {
+ if (!ctx->checkBelAvail(bel)) {
+ continue;
+ }
+
+ for (auto cell_type : cell_types) {
+ if (ctx->isValidBelForCellType(cell_type, bel)) {
+ available_bels[cell_type].push_back(bel);
+ }
+ }
+ }
+
+ for (auto &t : available_bels) {
+ ctx->shuffle(t.second.begin(), t.second.end());
+ }
+
+ for (auto &cell : ctx->cells) {
+ CellInfo *ci = cell.second.get();
+ if (ci->bel != BelId()) {
+ Loc loc = ctx->getBelLocation(ci->bel);
+ cell_locs[cell.first].x = loc.x;
+ cell_locs[cell.first].y = loc.y;
+ cell_locs[cell.first].locked = true;
+ cell_locs[cell.first].global = ctx->getBelGlobalBuf(ci->bel);
+ } else if (ci->cluster == ClusterId() || ctx->getClusterRootCell(ci->cluster) == ci) {
+ bool placed = false;
+ int attempt_count = 0;
+ while (!placed) {
+ ++attempt_count;
+ if (attempt_count > 25000) {
+ log_error("Unable to find a placement location for cell '%s'\n", ci->name.c_str(ctx));
+ }
+
+ // Make sure this cell type is in the available BEL map at
+ // all.
+ if (!available_bels.count(ci->type)) {
+ log_error("Unable to place cell '%s', no BELs remaining to implement cell type '%s'\n",
+ ci->name.c_str(ctx), ci->type.c_str(ctx));
+ }
+
+ // Find an unused BEL from bels_for_cell_type.
+ auto &bels_for_cell_type = available_bels.at(ci->type);
+ BelId bel;
+ while (true) {
+ if (bels_for_cell_type.empty()) {
+ log_error("Unable to place cell '%s', no BELs remaining to implement cell type '%s'\n",
+ ci->name.c_str(ctx), ci->type.c_str(ctx));
+ }
+
+ BelId candidate_bel = bels_for_cell_type.back();
+ bels_for_cell_type.pop_back();
+ if (bels_used.count(candidate_bel)) {
+ // candidate_bel has already been used by another
+ // cell type, skip it.
+ continue;
+ }
+
+ bel = candidate_bel;
+ break;
+ }
+
+ Loc loc = ctx->getBelLocation(bel);
+ cell_locs[cell.first].x = loc.x;
+ cell_locs[cell.first].y = loc.y;
+ cell_locs[cell.first].locked = false;
+ cell_locs[cell.first].global = ctx->getBelGlobalBuf(bel);
+
+ // FIXME
+ if (has_connectivity(cell.second.get()) && !cfg.ioBufTypes.count(ci->type)) {
+ bels_used.insert(bel);
+ place_cells.push_back(ci);
+ placed = true;
+ } else {
+ ctx->bindBel(bel, ci, STRENGTH_STRONG);
+ if (ctx->isBelLocationValid(bel)) {
+ cell_locs[cell.first].locked = true;
+ placed = true;
+ bels_used.insert(bel);
+ } else {
+ ctx->unbindBel(bel);
+ available_bels.at(ci->type).push_front(bel);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Setup the cells to be solved, returns the number of rows
+ int setup_solve_cells(pool<BelBucketId> *buckets = nullptr)
+ {
+ int row = 0;
+ solve_cells.clear();
+ // First clear the udata of all cells
+ for (auto &cell : ctx->cells)
+ cell.second->udata = dont_solve;
+ // Then update cells to be placed, which excludes cell children
+ for (auto cell : place_cells) {
+ if (buckets && !buckets->count(ctx->getBelBucketForCellType(cell->type)))
+ continue;
+ cell->udata = row++;
+ solve_cells.push_back(cell);
+ }
+ // Finally, update the udata of children
+ for (auto &cluster : cluster2cells)
+ for (auto child : cluster.second)
+ child->udata = ctx->getClusterRootCell(cluster.first)->udata;
+ return row;
+ }
+
+ // Update all chains
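+ // (each cluster child takes its root's position plus the cluster offset, clamped to the grid, and
+ // chain_size counts the same-type members of a cluster so legalisation can prioritise large macros)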
+ void update_all_chains()
+ {
+ for (auto cell : place_cells) {
+ chain_size[cell->name] = 1;
+ if (cell->cluster != ClusterId()) {
+ const auto base = cell_locs[cell->name];
+ for (auto child : cluster2cells.at(cell->cluster)) {
+ if (child->type == cell->type && child != cell)
+ chain_size[cell->name]++;
+ Loc offset = ctx->getClusterOffset(child);
+ cell_locs[child->name].x = std::max(0, std::min(max_x, base.x + offset.x));
+ cell_locs[child->name].y = std::max(0, std::min(max_y, base.y + offset.y));
+ }
+ }
+ }
+ }
+
+ // Run a function on all ports of a net - including the driver and all users
+ template <typename Tf> void foreach_port(NetInfo *net, Tf func)
+ {
+ if (net->driver.cell != nullptr)
+ func(net->driver, store_index<PortRef>());
+ for (auto usr : net->users.enumerate())
+ func(usr.value, usr.index);
+ }
+
+ // Build the system of equations for either X or Y
+ void build_equations(EquationSystem<double> &es, bool yaxis, int iter = -1)
+ {
+ // Return the x or y position of a cell, depending on yaxis
+ auto cell_pos = [&](CellInfo *cell) { return yaxis ? cell_locs.at(cell->name).y : cell_locs.at(cell->name).x; };
+ auto legal_pos = [&](CellInfo *cell) {
+ return yaxis ? cell_locs.at(cell->name).legal_y : cell_locs.at(cell->name).legal_x;
+ };
+
+ es.reset();
+
+ for (auto &net : ctx->nets) {
+ NetInfo *ni = net.second.get();
+ if (ni->driver.cell == nullptr)
+ continue;
+ if (ni->users.empty())
+ continue;
+ if (cell_locs.at(ni->driver.cell->name).global)
+ continue;
+ // Find the bounds of the net in this axis, and the ports that correspond to these bounds
+ PortRef *lbport = nullptr, *ubport = nullptr;
+ int lbpos = std::numeric_limits<int>::max(), ubpos = std::numeric_limits<int>::min();
+ foreach_port(ni, [&](PortRef &port, store_index<PortRef> user_idx) {
+ int pos = cell_pos(port.cell);
+ if (pos < lbpos) {
+ lbpos = pos;
+ lbport = &port;
+ }
+ if (pos > ubpos) {
+ ubpos = pos;
+ ubport = &port;
+ }
+ });
+ NPNR_ASSERT(lbport != nullptr);
+ NPNR_ASSERT(ubport != nullptr);
+
+ auto stamp_equation = [&](PortRef &var, PortRef &eqn, double weight) {
+ if (eqn.cell->udata == dont_solve)
+ return;
+ int row = eqn.cell->udata;
+ int v_pos = cell_pos(var.cell);
+ if (var.cell->udata != dont_solve) {
+ es.add_coeff(row, var.cell->udata, weight);
+ } else {
+ es.add_rhs(row, -v_pos * weight);
+ }
+ if (var.cell->cluster != ClusterId()) {
+ Loc offset = ctx->getClusterOffset(var.cell);
+ es.add_rhs(row, -(yaxis ? offset.y : offset.x) * weight);
+ }
+ };
+
+ // Add all relevant connections to the matrix
+ foreach_port(ni, [&](PortRef &port, store_index<PortRef> user_idx) {
+ int this_pos = cell_pos(port.cell);
+ auto process_arc = [&](PortRef *other) {
+ if (other == &port)
+ return;
+ int o_pos = cell_pos(other->cell);
+ double weight = 1.0 / (ni->users.entries() *
+ std::max<double>(1, (yaxis ? cfg.hpwl_scale_y : cfg.hpwl_scale_x) *
+ std::abs(o_pos - this_pos)));
+
+ if (user_idx) {
+ weight *= (1.0 + cfg.timingWeight * std::pow(tmg.get_criticality(CellPortKey(port)),
+ cfg.criticalityExponent));
+ }
+
+ // If cell 0 is not fixed, it will stamp +w on its equation and -w on the other end's equation,
+ // if the other end isn't fixed
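+ // For illustration (writing A/rhs for es.add_coeff/es.add_rhs and ignoring cluster offsets): a
+ // connection of weight w between movable cells a and b stamps
+ //   A[a][a] += w; A[a][b] -= w; A[b][b] += w; A[b][a] -= w;
+ // whereas if b is fixed at position p_b, only a's row is touched:
+ //   A[a][a] += w; rhs[a] += w * p_b;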
+ stamp_equation(port, port, weight);
+ stamp_equation(port, *other, -weight);
+ stamp_equation(*other, *other, weight);
+ stamp_equation(*other, port, -weight);
+ };
+ process_arc(lbport);
+ process_arc(ubport);
+ });
+ }
+ if (iter != -1) {
+ float alpha = cfg.alpha;
+ for (size_t row = 0; row < solve_cells.size(); row++) {
+ int l_pos = legal_pos(solve_cells.at(row));
+ int c_pos = cell_pos(solve_cells.at(row));
+
+ double weight =
+ alpha * iter /
+ std::max<double>(1, (yaxis ? cfg.hpwl_scale_y : cfg.hpwl_scale_x) * std::abs(l_pos - c_pos));
+ // Add an arc from legalised to current position
+ es.add_coeff(row, row, weight);
+ es.add_rhs(row, weight * l_pos);
+ }
+ }
+ }
+
+ // Solve the system of equations for either X or Y
+ void solve_equations(EquationSystem<double> &es, bool yaxis)
+ {
+ // Return the x or y position of a cell, depending on yaxis
+ auto cell_pos = [&](CellInfo *cell) { return yaxis ? cell_locs.at(cell->name).y : cell_locs.at(cell->name).x; };
+ std::vector<double> vals;
+ std::transform(solve_cells.begin(), solve_cells.end(), std::back_inserter(vals), cell_pos);
+ es.solve(vals, cfg.solverTolerance);
+ for (size_t i = 0; i < vals.size(); i++)
+ if (yaxis) {
+ cell_locs.at(solve_cells.at(i)->name).rawy = vals.at(i);
+ cell_locs.at(solve_cells.at(i)->name).y = std::min(max_y, std::max(0, int(vals.at(i))));
+ if (solve_cells.at(i)->region != nullptr)
+ cell_locs.at(solve_cells.at(i)->name).y =
+ limit_to_reg(solve_cells.at(i)->region, cell_locs.at(solve_cells.at(i)->name).y, true);
+ } else {
+ cell_locs.at(solve_cells.at(i)->name).rawx = vals.at(i);
+ cell_locs.at(solve_cells.at(i)->name).x = std::min(max_x, std::max(0, int(vals.at(i))));
+ if (solve_cells.at(i)->region != nullptr)
+ cell_locs.at(solve_cells.at(i)->name).x =
+ limit_to_reg(solve_cells.at(i)->region, cell_locs.at(solve_cells.at(i)->name).x, false);
+ }
+ }
+
+ // Compute HPWL
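+ // e.g. a net with pins at (2,3), (5,1) and (4,7) contributes hpwl_scale_x * (5 - 2) + hpwl_scale_y * (7 - 1)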
+ wirelen_t total_hpwl()
+ {
+ wirelen_t hpwl = 0;
+ for (auto &net : ctx->nets) {
+ NetInfo *ni = net.second.get();
+ if (ni->driver.cell == nullptr)
+ continue;
+ CellLocation &drvloc = cell_locs.at(ni->driver.cell->name);
+ if (drvloc.global)
+ continue;
+ int xmin = drvloc.x, xmax = drvloc.x, ymin = drvloc.y, ymax = drvloc.y;
+ for (auto &user : ni->users) {
+ CellLocation &usrloc = cell_locs.at(user.cell->name);
+ xmin = std::min(xmin, usrloc.x);
+ xmax = std::max(xmax, usrloc.x);
+ ymin = std::min(ymin, usrloc.y);
+ ymax = std::max(ymax, usrloc.y);
+ }
+ hpwl += cfg.hpwl_scale_x * (xmax - xmin) + cfg.hpwl_scale_y * (ymax - ymin);
+ }
+ return hpwl;
+ }
+
+ // Strict placement legalisation, performed after the initial HeAP spreading
+ void legalise_placement_strict(bool require_validity = false)
+ {
+ auto startt = std::chrono::high_resolution_clock::now();
+
+ // Unbind all cells placed in this solution
+ for (auto &cell : ctx->cells) {
+ CellInfo *ci = cell.second.get();
+ if (ci->bel != BelId() &&
+ (ci->udata != dont_solve ||
+ (ci->cluster != ClusterId() && ctx->getClusterRootCell(ci->cluster)->udata != dont_solve)))
+ ctx->unbindBel(ci->bel);
+ }
+
+ // At the moment we don't follow the full HeAP algorithm using cuts for legalisation, instead using
+ // the simple greedy largest-macro-first approach.
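+ // Cells are popped from a max-heap keyed on chain size, so the largest macros get first pick of
+ // locations; any cells ripped up along the way are re-queued with their own chain size.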
+ std::priority_queue<std::pair<int, IdString>> remaining;
+ for (auto cell : solve_cells) {
+ remaining.emplace(chain_size[cell->name], cell->name);
+ }
+ int ripup_radius = 2;
+ int total_iters = 0;
+ int total_iters_noreset = 0;
+ while (!remaining.empty()) {
+ auto top = remaining.top();
+ remaining.pop();
+
+ CellInfo *ci = ctx->cells.at(top.second).get();
+ // Already placed (e.g. a stale queue entry after being ripped up and re-placed), ignore
+ if (ci->bel != BelId())
+ continue;
+ // log_info(" Legalising %s (%s)\n", top.second.c_str(ctx), ci->type.c_str(ctx));
+ FastBels::FastBelsData *fb;
+ fast_bels.getBelsForCellType(ci->type, &fb);
+ int radius = 0;
+ int iter = 0;
+ int iter_at_radius = 0;
+ bool placed = false;
+ BelId bestBel;
+ int best_inp_len = std::numeric_limits<int>::max();
+
+ total_iters++;
+ total_iters_noreset++;
+ if (total_iters > int(solve_cells.size())) {
+ total_iters = 0;
+ ripup_radius = std::max(std::max(max_x, max_y), ripup_radius * 2);
+ }
+
+ if (total_iters_noreset > std::max(5000, 8 * int(ctx->cells.size()))) {
+ log_error("Unable to find legal placement for all cells, design is probably at utilisation limit.\n");
+ }
+
+ while (!placed) {
+
+ // Set a conservative timeout
+ if (iter > std::max(10000, 3 * int(ctx->cells.size())))
+ log_error("Unable to find legal placement for cell '%s', check constraints and utilisation.\n",
+ ctx->nameOf(ci));
+
+ // Determine a search radius around the solver location (which increases over time) that is clamped to
+ // the region constraint for the cell (if applicable)
+ int rx = radius, ry = radius;
+
+ if (ci->region != nullptr) {
+ rx = std::min(radius, (constraint_region_bounds[ci->region->name].x1 -
+ constraint_region_bounds[ci->region->name].x0) /
+ 2 +
+ 1);
+ ry = std::min(radius, (constraint_region_bounds[ci->region->name].y1 -
+ constraint_region_bounds[ci->region->name].y0) /
+ 2 +
+ 1);
+ }
+
+ // Pick a random X and Y location within our search radius
+ int nx = ctx->rng(2 * rx + 1) + std::max(cell_locs.at(ci->name).x - rx, 0);
+ int ny = ctx->rng(2 * ry + 1) + std::max(cell_locs.at(ci->name).y - ry, 0);
+
+ iter++;
+ iter_at_radius++;
+ if (iter >= (10 * (radius + 1))) {
+ // No luck yet, increase radius
+ radius = std::min(std::max(max_x, max_y), radius + 1);
+ while (radius < std::max(max_x, max_y)) {
+ // Keep increasing the radius until it will actually increase the number of cells we are
+ // checking (e.g. BRAM and DSP will not be in all cols/rows), so we don't waste effort
+ for (int x = std::max(0, cell_locs.at(ci->name).x - radius);
+ x <= std::min(max_x, cell_locs.at(ci->name).x + radius); x++) {
+ if (x >= int(fb->size()))
+ break;
+ for (int y = std::max(0, cell_locs.at(ci->name).y - radius);
+ y <= std::min(max_y, cell_locs.at(ci->name).y + radius); y++) {
+ if (y >= int(fb->at(x).size()))
+ break;
+ if (fb->at(x).at(y).size() > 0)
+ goto notempty;
+ }
+ }
+ radius = std::min(std::max(max_x, max_y), radius + 1);
+ }
+ notempty:
+ iter_at_radius = 0;
+ iter = 0;
+ }
+ // If our randomly chosen coordinate is out of bounds, or points to a tile with no relevant bels,
+ // ignore it
+ if (nx < 0 || nx > max_x)
+ continue;
+ if (ny < 0 || ny > max_y)
+ continue;
+
+ if (nx >= int(fb->size()))
+ continue;
+ if (ny >= int(fb->at(nx).size()))
+ continue;
+ if (fb->at(nx).at(ny).empty())
+ continue;
+
+ // The number of attempts to find a location to try
+ int need_to_explore = 2 * radius;
+
+ // If we have found at least one legal location; and made enough attempts; assume it's good enough and
+ // finish
+ if (iter_at_radius >= need_to_explore && bestBel != BelId()) {
+ CellInfo *bound = ctx->getBoundBelCell(bestBel);
+ if (bound != nullptr) {
+ ctx->unbindBel(bound->bel);
+ remaining.emplace(chain_size[bound->name], bound->name);
+ }
+ ctx->bindBel(bestBel, ci, STRENGTH_WEAK);
+ placed = true;
+ Loc loc = ctx->getBelLocation(bestBel);
+ cell_locs[ci->name].x = loc.x;
+ cell_locs[ci->name].y = loc.y;
+ break;
+ }
+
+ if (ci->cluster == ClusterId()) {
+ // The case where we have no relative constraints
+ for (auto sz : fb->at(nx).at(ny)) {
+ // Look through all bels in this tile; checking region constraint if applicable
+ if (!ci->testRegion(sz))
+ continue;
+ // Prefer available bels; unless we are dealing with a wide radius (e.g. difficult control sets)
+ // or occasionally trigger a tiebreaker
+ if (ctx->checkBelAvail(sz) || (radius > ripup_radius || ctx->rng(20000) < 10)) {
+ CellInfo *bound = ctx->getBoundBelCell(sz);
+ if (bound != nullptr) {
+ // Only rip up cells without constraints
+ if (bound->cluster != ClusterId())
+ continue;
+ ctx->unbindBel(bound->bel);
+ }
+ // Provisionally bind the bel
+ ctx->bindBel(sz, ci, STRENGTH_WEAK);
+ if (require_validity && !ctx->isBelLocationValid(sz)) {
+ // New location is not legal; unbind the cell (and rebind the cell we ripped up if
+ // applicable)
+ ctx->unbindBel(sz);
+ if (bound != nullptr)
+ ctx->bindBel(sz, bound, STRENGTH_WEAK);
+ } else if (iter_at_radius < need_to_explore) {
+ // It's legal, but we haven't tried enough locations yet
+ ctx->unbindBel(sz);
+ if (bound != nullptr)
+ ctx->bindBel(sz, bound, STRENGTH_WEAK);
+ int input_len = 0;
+ // Compute a fast input wirelength metric at this bel; and save if better than our last
+ // try
+ for (auto &port : ci->ports) {
+ auto &p = port.second;
+ if (p.type != PORT_IN || p.net == nullptr || p.net->driver.cell == nullptr)
+ continue;
+ CellInfo *drv = p.net->driver.cell;
+ auto drv_loc = cell_locs.find(drv->name);
+ if (drv_loc == cell_locs.end())
+ continue;
+ if (drv_loc->second.global)
+ continue;
+ input_len += std::abs(drv_loc->second.x - nx) + std::abs(drv_loc->second.y - ny);
+ }
+ if (input_len < best_inp_len) {
+ best_inp_len = input_len;
+ bestBel = sz;
+ }
+ break;
+ } else {
+ // It's legal, and we've tried enough. Finish.
+ if (bound != nullptr)
+ remaining.emplace(chain_size[bound->name], bound->name);
+ Loc loc = ctx->getBelLocation(sz);
+ cell_locs[ci->name].x = loc.x;
+ cell_locs[ci->name].y = loc.y;
+ placed = true;
+ break;
+ }
+ }
+ }
+ } else {
+ // We do have relative constraints
+ for (auto sz : fb->at(nx).at(ny)) {
+ // List of cells and their destination
+ std::vector<std::pair<CellInfo *, BelId>> targets;
+ // List of bels we placed things at; and the cell that was there before if applicable
+ std::vector<std::pair<BelId, CellInfo *>> swaps_made;
+
+ if (!ctx->getClusterPlacement(ci->cluster, sz, targets))
+ continue;
+
+ for (auto &target : targets) {
+ // Check it satisfies the region constraint if applicable
+ if (!target.first->testRegion(target.second))
+ goto fail;
+ CellInfo *bound = ctx->getBoundBelCell(target.second);
+ // Chains cannot overlap; so if we have to ripup a cell make sure it isn't part of a chain
+ if (bound != nullptr)
+ if (bound->cluster != ClusterId() || bound->belStrength > STRENGTH_WEAK)
+ goto fail;
+ }
+ // Actually perform the move; keeping track of the moves we make so we can revert them if needed
+ for (auto &target : targets) {
+ CellInfo *bound = ctx->getBoundBelCell(target.second);
+ if (bound != nullptr)
+ ctx->unbindBel(target.second);
+ ctx->bindBel(target.second, target.first, STRENGTH_STRONG);
+ swaps_made.emplace_back(target.second, bound);
+ }
+ // Check that the move we have made is legal
+ for (auto &sm : swaps_made) {
+ if (!ctx->isBelLocationValid(sm.first))
+ goto fail;
+ }
+
+ if (false) {
+ fail:
+ // If the move turned out to be illegal; revert all the moves we made
+ for (auto &swap : swaps_made) {
+ ctx->unbindBel(swap.first);
+ if (swap.second != nullptr)
+ ctx->bindBel(swap.first, swap.second, STRENGTH_WEAK);
+ }
+ continue;
+ }
+ for (auto &target : targets) {
+ Loc loc = ctx->getBelLocation(target.second);
+ cell_locs[target.first->name].x = loc.x;
+ cell_locs[target.first->name].y = loc.y;
+ // log_info("%s %d %d %d\n", target.first->name.c_str(ctx), loc.x, loc.y, loc.z);
+ }
+ for (auto &swap : swaps_made) {
+ // Where we have ripped up cells; add them to the queue
+ if (swap.second != nullptr)
+ remaining.emplace(chain_size[swap.second->name], swap.second->name);
+ }
+
+ placed = true;
+ break;
+ }
+ }
+ }
+ }
+ auto endt = std::chrono::high_resolution_clock::now();
+ sl_time += std::chrono::duration<float>(endt - startt).count();
+ }
+ // Implementation of the cut-based spreading as described in the HeAP/SimPL papers
+
+ template <typename T> T limit_to_reg(Region *reg, T val, bool dir)
+ {
+ if (reg == nullptr)
+ return val;
+ int limit_low = dir ? constraint_region_bounds[reg->name].y0 : constraint_region_bounds[reg->name].x0;
+ int limit_high = dir ? constraint_region_bounds[reg->name].y1 : constraint_region_bounds[reg->name].x1;
+ return std::max<T>(std::min<T>(val, limit_high), limit_low);
+ }
+
+ struct ChainExtent
+ {
+ int x0, y0, x1, y1;
+ };
+
+ struct SpreaderRegion
+ {
+ int id;
+ int x0, y0, x1, y1;
+ std::vector<int> cells, bels;
+ bool overused(float beta) const
+ {
+ for (size_t t = 0; t < cells.size(); t++) {
+ if (bels.at(t) < 4) {
+ if (cells.at(t) > bels.at(t))
+ return true;
+ } else {
+ if (cells.at(t) > beta * bels.at(t))
+ return true;
+ }
+ }
+ return false;
+ }
+ };
+
+ class CutSpreader
+ {
+ public:
+ CutSpreader(HeAPPlacer *p, const pool<BelBucketId> &buckets) : p(p), ctx(p->ctx), buckets(buckets)
+ {
+ // Get fast BELs data for all buckets being Cut/Spread.
+ size_t idx = 0;
+ for (BelBucketId bucket : buckets) {
+ type_index[bucket] = idx;
+ FastBels::FastBelsData *fast_bels;
+ p->fast_bels.getBelsForBelBucket(bucket, &fast_bels);
+ fb.push_back(fast_bels);
+ ++idx;
+ NPNR_ASSERT(fb.size() == idx);
+ }
+ }
+ static int seq;
+ void run()
+ {
+ auto startt = std::chrono::high_resolution_clock::now();
+ init();
+ find_overused_regions();
+ for (auto &r : regions) {
+ if (merged_regions.count(r.id))
+ continue;
+#if 0
+ log_info("%s (%d, %d) |_> (%d, %d) %d/%d\n", beltype.c_str(ctx), r.x0, r.y0, r.x1, r.y1, r.cells,
+ r.bels);
+#endif
+ }
+ expand_regions();
+ std::queue<std::pair<int, bool>> workqueue;
+#if 0
+ std::vector<std::pair<double, double>> orig;
+ if (ctx->debug)
+ for (auto c : p->solve_cells)
+ orig.emplace_back(p->cell_locs[c->name].rawx, p->cell_locs[c->name].rawy);
+#endif
+ for (auto &r : regions) {
+ if (merged_regions.count(r.id))
+ continue;
+#if 0
+ for (auto t : sorted(beltype)) {
+ log_info("%s (%d, %d) |_> (%d, %d) %d/%d\n", t.c_str(ctx), r.x0, r.y0, r.x1, r.y1,
+ r.cells.at(type_index.at(t)), r.bels.at(type_index.at(t)));
+ }
+
+#endif
+ workqueue.emplace(r.id, false);
+ }
+ while (!workqueue.empty()) {
+ auto front = workqueue.front();
+ workqueue.pop();
+ auto &r = regions.at(front.first);
+ if (std::all_of(r.cells.begin(), r.cells.end(), [](int x) { return x == 0; }))
+ continue;
+ auto res = cut_region(r, front.second);
+ if (res) {
+ workqueue.emplace(res->first, !front.second);
+ workqueue.emplace(res->second, !front.second);
+ } else {
+ // Try the other dir, in case stuck in one direction only
+ auto res2 = cut_region(r, !front.second);
+ if (res2) {
+ workqueue.emplace(res2->first, front.second);
+ workqueue.emplace(res2->second, front.second);
+ }
+ }
+ }
+#if 0
+ if (ctx->debug) {
+ std::ofstream sp("spread" + std::to_string(seq) + ".csv");
+ for (size_t i = 0; i < p->solve_cells.size(); i++) {
+ auto &c = p->solve_cells.at(i);
+ if (c->type != beltype)
+ continue;
+ sp << orig.at(i).first << "," << orig.at(i).second << "," << p->cell_locs[c->name].rawx << "," << p->cell_locs[c->name].rawy << std::endl;
+ }
+ std::ofstream oc("cells" + std::to_string(seq) + ".csv");
+ for (size_t y = 0; y <= p->max_y; y++) {
+ for (size_t x = 0; x <= p->max_x; x++) {
+ oc << cells_at_location.at(x).at(y).size() << ", ";
+ }
+ oc << std::endl;
+ }
+ ++seq;
+ }
+#endif
+ auto endt = std::chrono::high_resolution_clock::now();
+ p->cl_time += std::chrono::duration<float>(endt - startt).count();
+ }
+
+ private:
+ HeAPPlacer *p;
+ Context *ctx;
+ pool<BelBucketId> buckets;
+ dict<BelBucketId, size_t> type_index;
+ std::vector<std::vector<std::vector<int>>> occupancy;
+ std::vector<std::vector<int>> groups;
+ std::vector<std::vector<ChainExtent>> chaines;
+ std::map<IdString, ChainExtent> cell_extents;
+
+ std::vector<std::vector<std::vector<std::vector<BelId>>> *> fb;
+
+ std::vector<SpreaderRegion> regions;
+ pool<int> merged_regions;
+ // Cells at a location, sorted by real (not integer) x and y
+ std::vector<std::vector<std::vector<CellInfo *>>> cells_at_location;
+
+ int occ_at(int x, int y, int type) { return occupancy.at(x).at(y).at(type); }
+
+ int bels_at(int x, int y, int type)
+ {
+ if (x >= int(fb.at(type)->size()) || y >= int(fb.at(type)->at(x).size()))
+ return 0;
+ return int(fb.at(type)->at(x).at(y).size());
+ }
+
+ bool is_cell_fixed(const CellInfo &cell) const
+ {
+ return buckets.count(ctx->getBelBucketForCellType(cell.type)) == 0;
+ }
+
+ size_t cell_index(const CellInfo &cell) const { return type_index.at(ctx->getBelBucketForCellType(cell.type)); }
+
+ void init()
+ {
+ occupancy.resize(p->max_x + 1,
+ std::vector<std::vector<int>>(p->max_y + 1, std::vector<int>(buckets.size(), 0)));
+ groups.resize(p->max_x + 1, std::vector<int>(p->max_y + 1, -1));
+ chaines.resize(p->max_x + 1, std::vector<ChainExtent>(p->max_y + 1));
+ cells_at_location.resize(p->max_x + 1, std::vector<std::vector<CellInfo *>>(p->max_y + 1));
+ for (int x = 0; x <= p->max_x; x++)
+ for (int y = 0; y <= p->max_y; y++) {
+ for (int t = 0; t < int(buckets.size()); t++) {
+ occupancy.at(x).at(y).at(t) = 0;
+ }
+ groups.at(x).at(y) = -1;
+ chaines.at(x).at(y) = {x, y, x, y};
+ }
+
+ auto set_chain_ext = [&](IdString cell, int x, int y) {
+ if (!cell_extents.count(cell))
+ cell_extents[cell] = {x, y, x, y};
+ else {
+ cell_extents[cell].x0 = std::min(cell_extents[cell].x0, x);
+ cell_extents[cell].y0 = std::min(cell_extents[cell].y0, y);
+ cell_extents[cell].x1 = std::max(cell_extents[cell].x1, x);
+ cell_extents[cell].y1 = std::max(cell_extents[cell].y1, y);
+ }
+ };
+
+ for (auto &cell_loc : p->cell_locs) {
+ IdString cell_name = cell_loc.first;
+ const CellInfo &cell = *ctx->cells.at(cell_name);
+ const CellLocation &loc = cell_loc.second;
+ if (is_cell_fixed(cell)) {
+ continue;
+ }
+
+ if (cell.belStrength > STRENGTH_STRONG) {
+ continue;
+ }
+
+ occupancy.at(cell_loc.second.x).at(cell_loc.second.y).at(cell_index(cell))++;
+
+ // Compute ultimate extent of each chain root
+ if (cell.cluster != ClusterId()) {
+ set_chain_ext(ctx->getClusterRootCell(cell.cluster)->name, loc.x, loc.y);
+ }
+ }
+
+ for (auto &cell_loc : p->cell_locs) {
+ IdString cell_name = cell_loc.first;
+ const CellInfo &cell = *ctx->cells.at(cell_name);
+ const CellLocation &loc = cell_loc.second;
+ if (is_cell_fixed(cell)) {
+ continue;
+ }
+
+ if (cell.belStrength > STRENGTH_STRONG) {
+ continue;
+ }
+
+ // Transfer chain extents to the actual chains structure
+ ChainExtent *ce = nullptr;
+ if (cell.cluster != ClusterId()) {
+ ce = &(cell_extents.at(ctx->getClusterRootCell(cell.cluster)->name));
+ }
+
+ if (ce) {
+ auto &lce = chaines.at(loc.x).at(loc.y);
+ lce.x0 = std::min(lce.x0, ce->x0);
+ lce.y0 = std::min(lce.y0, ce->y0);
+ lce.x1 = std::max(lce.x1, ce->x1);
+ lce.y1 = std::max(lce.y1, ce->y1);
+ }
+ }
+
+ for (auto cell : p->solve_cells) {
+ if (is_cell_fixed(*cell)) {
+ continue;
+ }
+
+ cells_at_location.at(p->cell_locs.at(cell->name).x).at(p->cell_locs.at(cell->name).y).push_back(cell);
+ }
+ }
+
+ void merge_regions(SpreaderRegion &merged, SpreaderRegion &mergee)
+ {
+ // Prevent grow_region from recursing while doing this
+ for (int x = mergee.x0; x <= mergee.x1; x++) {
+ for (int y = mergee.y0; y <= mergee.y1; y++) {
+ // log_info("%d %d\n", groups.at(x).at(y), mergee.id);
+ NPNR_ASSERT(groups.at(x).at(y) == mergee.id);
+ groups.at(x).at(y) = merged.id;
+ for (size_t t = 0; t < buckets.size(); t++) {
+ merged.cells.at(t) += occ_at(x, y, t);
+ merged.bels.at(t) += bels_at(x, y, t);
+ }
+ }
+ }
+
+ merged_regions.insert(mergee.id);
+ grow_region(merged, mergee.x0, mergee.y0, mergee.x1, mergee.y1);
+ }
+
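+ // Expand region r so it covers at least (x0, y0) |_> (x1, y1), accumulating the bel/cell counts of
+ // newly covered tiles, merging in any overlapping regions and recursing to cover chains that extend
+ // beyond the new bounds.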
+ void grow_region(SpreaderRegion &r, int x0, int y0, int x1, int y1, bool init = false)
+ {
+ // log_info("growing to (%d, %d) |_> (%d, %d)\n", x0, y0, x1, y1);
+ if ((x0 >= r.x0 && y0 >= r.y0 && x1 <= r.x1 && y1 <= r.y1) || init)
+ return;
+ int old_x0 = r.x0 + (init ? 1 : 0), old_y0 = r.y0, old_x1 = r.x1, old_y1 = r.y1;
+ r.x0 = std::min(r.x0, x0);
+ r.y0 = std::min(r.y0, y0);
+ r.x1 = std::max(r.x1, x1);
+ r.y1 = std::max(r.y1, y1);
+
+ auto process_location = [&](int x, int y) {
+ // Merge with any overlapping regions
+ if (groups.at(x).at(y) == -1) {
+ for (size_t t = 0; t < buckets.size(); t++) {
+ r.bels.at(t) += bels_at(x, y, t);
+ r.cells.at(t) += occ_at(x, y, t);
+ }
+ }
+
+ if (groups.at(x).at(y) != -1 && groups.at(x).at(y) != r.id)
+ merge_regions(r, regions.at(groups.at(x).at(y)));
+ groups.at(x).at(y) = r.id;
+ // Grow to cover any chains
+ auto &chaine = chaines.at(x).at(y);
+ grow_region(r, chaine.x0, chaine.y0, chaine.x1, chaine.y1);
+ };
+ for (int x = r.x0; x < old_x0; x++)
+ for (int y = r.y0; y <= r.y1; y++)
+ process_location(x, y);
+ for (int x = old_x1 + 1; x <= x1; x++)
+ for (int y = r.y0; y <= r.y1; y++)
+ process_location(x, y);
+ for (int y = r.y0; y < old_y0; y++)
+ for (int x = r.x0; x <= r.x1; x++)
+ process_location(x, y);
+ for (int y = old_y1 + 1; y <= r.y1; y++)
+ for (int x = r.x0; x <= r.x1; x++)
+ process_location(x, y);
+ }
+
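+ // Seed a SpreaderRegion at every tile where, for some bucket, cell demand exceeds the available bels,
+ // then greedily expand it in x and y while the neighbouring row/column is still over-occupied.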
+ void find_overused_regions()
+ {
+ for (int x = 0; x <= p->max_x; x++)
+ for (int y = 0; y <= p->max_y; y++) {
+ // Either already in a group, or not overutilised. Ignore
+ if (groups.at(x).at(y) != -1)
+ continue;
+ bool overutilised = false;
+ for (size_t t = 0; t < buckets.size(); t++) {
+ if (occ_at(x, y, t) > bels_at(x, y, t)) {
+ overutilised = true;
+ break;
+ }
+ }
+
+ if (!overutilised)
+ continue;
+ // log_info("%d %d %d\n", x, y, occ_at(x, y));
+ int id = int(regions.size());
+ groups.at(x).at(y) = id;
+ SpreaderRegion reg;
+ reg.id = id;
+ reg.x0 = reg.x1 = x;
+ reg.y0 = reg.y1 = y;
+ for (size_t t = 0; t < buckets.size(); t++) {
+ reg.bels.push_back(bels_at(x, y, t));
+ reg.cells.push_back(occ_at(x, y, t));
+ }
+ // Make sure we cover carries, etc
+ grow_region(reg, reg.x0, reg.y0, reg.x1, reg.y1, true);
+
+ bool expanded = true;
+ while (expanded) {
+ expanded = false;
+ // Keep trying expansion in x and y, until we find no over-occupancy cells
+ // or hit grouped cells
+
+ // First try expanding in x
+ if (reg.x1 < p->max_x) {
+ bool over_occ_x = false;
+ for (int y1 = reg.y0; y1 <= reg.y1; y1++) {
+ for (size_t t = 0; t < buckets.size(); t++) {
+ if (occ_at(reg.x1 + 1, y1, t) > bels_at(reg.x1 + 1, y1, t)) {
+ over_occ_x = true;
+ break;
+ }
+ }
+ }
+ if (over_occ_x) {
+ expanded = true;
+ grow_region(reg, reg.x0, reg.y0, reg.x1 + 1, reg.y1);
+ }
+ }
+
+ if (reg.y1 < p->max_y) {
+ bool over_occ_y = false;
+ for (int x1 = reg.x0; x1 <= reg.x1; x1++) {
+ for (size_t t = 0; t < buckets.size(); t++) {
+ if (occ_at(x1, reg.y1 + 1, t) > bels_at(x1, reg.y1 + 1, t)) {
+ over_occ_y = true;
+ break;
+ }
+ }
+ }
+ if (over_occ_y) {
+ expanded = true;
+ grow_region(reg, reg.x0, reg.y0, reg.x1, reg.y1 + 1);
+ }
+ }
+ }
+ regions.push_back(reg);
+ }
+ }
+
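+ // Keep growing each overused region by up to spread_scale_x/y steps per axis until demand fits within
+ // (roughly) beta * available bels for every bucket, or the region cannot grow any further.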
+ void expand_regions()
+ {
+ std::queue<int> overu_regions;
+ float beta = p->cfg.beta;
+ for (auto &r : regions) {
+ if (!merged_regions.count(r.id) && r.overused(beta))
+ overu_regions.push(r.id);
+ }
+ while (!overu_regions.empty()) {
+ int rid = overu_regions.front();
+ overu_regions.pop();
+ if (merged_regions.count(rid))
+ continue;
+ auto &reg = regions.at(rid);
+ while (reg.overused(beta)) {
+ bool changed = false;
+ for (int j = 0; j < p->cfg.spread_scale_x; j++) {
+ if (reg.x0 > 0) {
+ grow_region(reg, reg.x0 - 1, reg.y0, reg.x1, reg.y1);
+ changed = true;
+ if (!reg.overused(beta))
+ break;
+ }
+ if (reg.x1 < p->max_x) {
+ grow_region(reg, reg.x0, reg.y0, reg.x1 + 1, reg.y1);
+ changed = true;
+ if (!reg.overused(beta))
+ break;
+ }
+ }
+ for (int j = 0; j < p->cfg.spread_scale_y; j++) {
+ if (reg.y0 > 0) {
+ grow_region(reg, reg.x0, reg.y0 - 1, reg.x1, reg.y1);
+ changed = true;
+ if (!reg.overused(beta))
+ break;
+ }
+ if (reg.y1 < p->max_y) {
+ grow_region(reg, reg.x0, reg.y0, reg.x1, reg.y1 + 1);
+ changed = true;
+ if (!reg.overused(beta))
+ break;
+ }
+ }
+ if (!changed) {
+ for (auto bucket : buckets) {
+ if (reg.cells.at(type_index.at(bucket)) > reg.bels.at(type_index.at(bucket))) {
+ IdString bucket_name = ctx->getBelBucketName(bucket);
+ log_error("Failed to expand region (%d, %d) |_> (%d, %d) of %d %ss\n", reg.x0, reg.y0,
+ reg.x1, reg.y1, reg.cells.at(type_index.at(bucket)), bucket_name.c_str(ctx));
+ }
+ }
+ break;
+ }
+ }
+ }
+ }
+
+ // Implementation of the recursive cut-based spreading as described in the HeAP paper
+ // Note we use "left" to mean "-x/-y" depending on dir and "right" to mean "+x/+y" depending on dir
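+ // Sketch of a single cut_region() pass: gather the region's cells sorted by raw position, pick a
+ // pivot near half the total cell area, scan for the row/column cut that minimises the utilisation
+ // imbalance while leaving clearance for the largest chain on each side, nudge the pivot to remove
+ // any remaining overutilisation, then bin-lerp each half's cells into its side of the cut.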
+
+ std::vector<CellInfo *> cut_cells;
+
+ boost::optional<std::pair<int, int>> cut_region(SpreaderRegion &r, bool dir)
+ {
+ cut_cells.clear();
+ auto &cal = cells_at_location;
+ int total_cells = 0, total_bels = 0;
+ for (int x = r.x0; x <= r.x1; x++) {
+ for (int y = r.y0; y <= r.y1; y++) {
+ std::copy(cal.at(x).at(y).begin(), cal.at(x).at(y).end(), std::back_inserter(cut_cells));
+ for (size_t t = 0; t < buckets.size(); t++)
+ total_bels += bels_at(x, y, t);
+ }
+ }
+ for (auto &cell : cut_cells) {
+ total_cells += p->chain_size.count(cell->name) ? p->chain_size.at(cell->name) : 1;
+ }
+ std::sort(cut_cells.begin(), cut_cells.end(), [&](const CellInfo *a, const CellInfo *b) {
+ return dir ? (p->cell_locs.at(a->name).rawy < p->cell_locs.at(b->name).rawy)
+ : (p->cell_locs.at(a->name).rawx < p->cell_locs.at(b->name).rawx);
+ });
+
+ if (cut_cells.size() < 2)
+ return {};
+ // Find the cells midpoint, counting chains in terms of their total size - making the initial source cut
+ int pivot_cells = 0;
+ int pivot = 0;
+ for (auto &cell : cut_cells) {
+ pivot_cells += p->chain_size.count(cell->name) ? p->chain_size.at(cell->name) : 1;
+ if (pivot_cells >= total_cells / 2)
+ break;
+ pivot++;
+ }
+ if (pivot >= int(cut_cells.size())) {
+ pivot = int(cut_cells.size()) - 1;
+ }
+ // log_info("orig pivot %d/%d lc %d rc %d\n", pivot, int(cut_cells.size()), pivot_cells, total_cells -
+ // pivot_cells);
+
+ // Find the clearance required either side of the pivot
+ int clearance_l = 0, clearance_r = 0;
+ for (size_t i = 0; i < cut_cells.size(); i++) {
+ int size;
+ if (cell_extents.count(cut_cells.at(i)->name)) {
+ auto &ce = cell_extents.at(cut_cells.at(i)->name);
+ size = dir ? (ce.y1 - ce.y0 + 1) : (ce.x1 - ce.x0 + 1);
+ } else {
+ size = 1;
+ }
+ if (int(i) < pivot)
+ clearance_l = std::max(clearance_l, size);
+ else
+ clearance_r = std::max(clearance_r, size);
+ }
+ // Find the target cut that minimises difference in utilisation, whilst trying to ensure that all chains
+ // still fit
+
+ // First trim the boundaries of the region in the axis-of-interest, skipping any rows/cols without any
+ // bels of the appropriate type
+ int trimmed_l = dir ? r.y0 : r.x0, trimmed_r = dir ? r.y1 : r.x1;
+ while (trimmed_l < (dir ? r.y1 : r.x1)) {
+ bool have_bels = false;
+ for (int i = dir ? r.x0 : r.y0; i <= (dir ? r.x1 : r.y1); i++) {
+ for (size_t t = 0; t < buckets.size(); t++) {
+ if (bels_at(dir ? i : trimmed_l, dir ? trimmed_l : i, t) > 0) {
+ have_bels = true;
+ break;
+ }
+ }
+ }
+
+ if (have_bels)
+ break;
+
+ trimmed_l++;
+ }
+ while (trimmed_r > (dir ? r.y0 : r.x0)) {
+ bool have_bels = false;
+ for (int i = dir ? r.x0 : r.y0; i <= (dir ? r.x1 : r.y1); i++) {
+ for (size_t t = 0; t < buckets.size(); t++) {
+ if (bels_at(dir ? i : trimmed_r, dir ? trimmed_r : i, t) > 0) {
+ have_bels = true;
+ break;
+ }
+ }
+ }
+
+ if (have_bels)
+ break;
+
+ trimmed_r--;
+ }
+ // log_info("tl %d tr %d cl %d cr %d\n", trimmed_l, trimmed_r, clearance_l, clearance_r);
+ if ((trimmed_r - trimmed_l + 1) <= std::max(clearance_l, clearance_r))
+ return {};
+ // Now find the initial target cut that minimises utilisation imbalance, whilst
+ // meeting the clearance requirements for any large macros
+ std::vector<int> left_cells_v(buckets.size(), 0), right_cells_v(buckets.size(), 0);
+ std::vector<int> left_bels_v(buckets.size(), 0), right_bels_v(r.bels);
+ for (int i = 0; i <= pivot; i++)
+ left_cells_v.at(cell_index(*cut_cells.at(i))) +=
+ p->chain_size.count(cut_cells.at(i)->name) ? p->chain_size.at(cut_cells.at(i)->name) : 1;
+ for (int i = pivot + 1; i < int(cut_cells.size()); i++)
+ right_cells_v.at(cell_index(*cut_cells.at(i))) +=
+ p->chain_size.count(cut_cells.at(i)->name) ? p->chain_size.at(cut_cells.at(i)->name) : 1;
+
+ int best_tgt_cut = -1;
+ double best_deltaU = std::numeric_limits<double>::max();
+ // std::pair<int, int> target_cut_bels;
+ std::vector<int> slither_bels(buckets.size(), 0);
+ for (int i = trimmed_l; i <= trimmed_r; i++) {
+ for (size_t t = 0; t < buckets.size(); t++)
+ slither_bels.at(t) = 0;
+ for (int j = dir ? r.x0 : r.y0; j <= (dir ? r.x1 : r.y1); j++) {
+ for (size_t t = 0; t < buckets.size(); t++)
+ slither_bels.at(t) += dir ? bels_at(j, i, t) : bels_at(i, j, t);
+ }
+ for (size_t t = 0; t < buckets.size(); t++) {
+ left_bels_v.at(t) += slither_bels.at(t);
+ right_bels_v.at(t) -= slither_bels.at(t);
+ }
+
+ if (((i - trimmed_l) + 1) >= clearance_l && ((trimmed_r - i) + 1) >= clearance_r) {
+ // Solution is potentially valid
+ double aU = 0;
+ for (size_t t = 0; t < buckets.size(); t++)
+ aU += (left_cells_v.at(t) + right_cells_v.at(t)) *
+ std::abs(double(left_cells_v.at(t)) / double(std::max(left_bels_v.at(t), 1)) -
+ double(right_cells_v.at(t)) / double(std::max(right_bels_v.at(t), 1)));
+ if (aU < best_deltaU) {
+ best_deltaU = aU;
+ best_tgt_cut = i;
+ }
+ }
+ }
+ if (best_tgt_cut == -1)
+ return {};
+ // left_bels = target_cut_bels.first;
+ // right_bels = target_cut_bels.second;
+ for (size_t t = 0; t < buckets.size(); t++) {
+ left_bels_v.at(t) = 0;
+ right_bels_v.at(t) = 0;
+ }
+ for (int x = r.x0; x <= (dir ? r.x1 : best_tgt_cut); x++)
+ for (int y = r.y0; y <= (dir ? best_tgt_cut : r.y1); y++) {
+ for (size_t t = 0; t < buckets.size(); t++) {
+ left_bels_v.at(t) += bels_at(x, y, t);
+ }
+ }
+ for (int x = dir ? r.x0 : (best_tgt_cut + 1); x <= r.x1; x++)
+ for (int y = dir ? (best_tgt_cut + 1) : r.y0; y <= r.y1; y++) {
+ for (size_t t = 0; t < buckets.size(); t++) {
+ right_bels_v.at(t) += bels_at(x, y, t);
+ }
+ }
+ if (std::accumulate(left_bels_v.begin(), left_bels_v.end(), 0) == 0 ||
+ std::accumulate(right_bels_v.begin(), right_bels_v.end(), 0) == 0)
+ return {};
+
+ // Perturb the source cut to eliminate overutilisation
+ auto is_part_overutil = [&](bool r) {
+ double delta = 0;
+ for (size_t t = 0; t < left_cells_v.size(); t++) {
+ delta += double(left_cells_v.at(t)) / double(std::max(left_bels_v.at(t), 1)) -
+ double(right_cells_v.at(t)) / double(std::max(right_bels_v.at(t), 1));
+ }
+ return r ? delta < 0 : delta > 0;
+ };
+ while (pivot > 0 && is_part_overutil(false)) {
+ auto &move_cell = cut_cells.at(pivot);
+ int size = p->chain_size.count(move_cell->name) ? p->chain_size.at(move_cell->name) : 1;
+ left_cells_v.at(cell_index(*cut_cells.at(pivot))) -= size;
+ right_cells_v.at(cell_index(*cut_cells.at(pivot))) += size;
+ pivot--;
+ }
+ while (pivot < int(cut_cells.size()) - 1 && is_part_overutil(true)) {
+ auto &move_cell = cut_cells.at(pivot + 1);
+ int size = p->chain_size.count(move_cell->name) ? p->chain_size.at(move_cell->name) : 1;
+ left_cells_v.at(cell_index(*move_cell)) += size;
+ right_cells_v.at(cell_index(*move_cell)) -= size;
+ pivot++;
+ }
+
+ // Split regions into bins, and then spread cells by linear interpolation within those bins
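+ // e.g. with 25 cells and a target span of [10, 20], up to K=10 bins are formed; within each bin the
+ // cells' raw positions are mapped linearly from their original span onto that bin's share of the
+ // target span, preserving their relative order.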
+ auto spread_binlerp = [&](int cells_start, int cells_end, double area_l, double area_r) {
+ int N = cells_end - cells_start;
+ if (N <= 2) {
+ for (int i = cells_start; i < cells_end; i++) {
+ auto &pos = dir ? p->cell_locs.at(cut_cells.at(i)->name).rawy
+ : p->cell_locs.at(cut_cells.at(i)->name).rawx;
+ pos = area_l + (i - cells_start) * ((area_r - area_l) / N);
+ }
+ return;
+ }
+ // Split region into up to 10 (K) bins
+ int K = std::min<int>(N, 10);
+ std::vector<std::pair<int, double>> bin_bounds; // [(cell start, area start)]
+ bin_bounds.emplace_back(cells_start, area_l);
+ for (int i = 1; i < K; i++)
+ bin_bounds.emplace_back(cells_start + (N * i) / K, area_l + ((area_r - area_l + 0.99) * i) / K);
+ bin_bounds.emplace_back(cells_end, area_r + 0.99);
+ for (int i = 0; i < K; i++) {
+ auto &bl = bin_bounds.at(i), br = bin_bounds.at(i + 1);
+ double orig_left = dir ? p->cell_locs.at(cut_cells.at(bl.first)->name).rawy
+ : p->cell_locs.at(cut_cells.at(bl.first)->name).rawx;
+ double orig_right = dir ? p->cell_locs.at(cut_cells.at(br.first - 1)->name).rawy
+ : p->cell_locs.at(cut_cells.at(br.first - 1)->name).rawx;
+ double m = (br.second - bl.second) / std::max(0.00001, orig_right - orig_left);
+ for (int j = bl.first; j < br.first; j++) {
+ Region *cr = cut_cells.at(j)->region;
+ if (cr != nullptr) {
+ // Limit spreading bounds to constraint region; if applicable
+ double brsc = p->limit_to_reg(cr, br.second, dir);
+ double blsc = p->limit_to_reg(cr, bl.second, dir);
+ double mr = (brsc - blsc) / std::max(0.00001, orig_right - orig_left);
+ auto &pos = dir ? p->cell_locs.at(cut_cells.at(j)->name).rawy
+ : p->cell_locs.at(cut_cells.at(j)->name).rawx;
+ NPNR_ASSERT(pos >= orig_left && pos <= orig_right);
+ pos = blsc + mr * (pos - orig_left);
+ } else {
+ auto &pos = dir ? p->cell_locs.at(cut_cells.at(j)->name).rawy
+ : p->cell_locs.at(cut_cells.at(j)->name).rawx;
+ NPNR_ASSERT(pos >= orig_left && pos <= orig_right);
+ pos = bl.second + m * (pos - orig_left);
+ }
+ }
+ }
+ };
+ spread_binlerp(0, pivot + 1, trimmed_l, best_tgt_cut);
+ spread_binlerp(pivot + 1, int(cut_cells.size()), best_tgt_cut + 1, trimmed_r);
+ // Update various data structures
+ for (int x = r.x0; x <= r.x1; x++)
+ for (int y = r.y0; y <= r.y1; y++) {
+ cells_at_location.at(x).at(y).clear();
+ }
+ for (auto cell : cut_cells) {
+ auto &cl = p->cell_locs.at(cell->name);
+ cl.x = std::min(r.x1, std::max(r.x0, int(cl.rawx)));
+ cl.y = std::min(r.y1, std::max(r.y0, int(cl.rawy)));
+ cells_at_location.at(cl.x).at(cl.y).push_back(cell);
+ }
+ SpreaderRegion rl, rr;
+ rl.id = int(regions.size());
+ rl.x0 = r.x0;
+ rl.y0 = r.y0;
+ rl.x1 = dir ? r.x1 : best_tgt_cut;
+ rl.y1 = dir ? best_tgt_cut : r.y1;
+ rl.cells = left_cells_v;
+ rl.bels = left_bels_v;
+ rr.id = int(regions.size()) + 1;
+ rr.x0 = dir ? r.x0 : (best_tgt_cut + 1);
+ rr.y0 = dir ? (best_tgt_cut + 1) : r.y0;
+ rr.x1 = r.x1;
+ rr.y1 = r.y1;
+ rr.cells = right_cells_v;
+ rr.bels = right_bels_v;
+ regions.push_back(rl);
+ regions.push_back(rr);
+ for (int x = rl.x0; x <= rl.x1; x++)
+ for (int y = rl.y0; y <= rl.y1; y++)
+ groups.at(x).at(y) = rl.id;
+ for (int x = rr.x0; x <= rr.x1; x++)
+ for (int y = rr.y0; y <= rr.y1; y++)
+ groups.at(x).at(y) = rr.id;
+ return std::make_pair(rl.id, rr.id);
+ };
+ };
+ typedef decltype(CellInfo::udata) cell_udata_t;
+ cell_udata_t dont_solve = std::numeric_limits<cell_udata_t>::max();
+};
+int HeAPPlacer::CutSpreader::seq = 0;
+
+bool placer_heap(Context *ctx, PlacerHeapCfg cfg) { return HeAPPlacer(ctx, cfg).place(); }
+
+PlacerHeapCfg::PlacerHeapCfg(Context *ctx)
+{
+ alpha = ctx->setting<float>("placerHeap/alpha");
+ beta = ctx->setting<float>("placerHeap/beta");
+ criticalityExponent = ctx->setting<int>("placerHeap/criticalityExponent");
+ timingWeight = ctx->setting<int>("placerHeap/timingWeight");
+ parallelRefine = ctx->setting<bool>("placerHeap/parallelRefine", false);
+
+ timing_driven = ctx->setting<bool>("timing_driven");
+ solverTolerance = 1e-5;
+ placeAllAtOnce = false;
+
+ hpwl_scale_x = 1;
+ hpwl_scale_y = 1;
+ spread_scale_x = 1;
+ spread_scale_y = 1;
+}
+
+NEXTPNR_NAMESPACE_END
+
+#else
+
+#include "log.h"
+#include "nextpnr.h"
+#include "placer_heap.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+bool placer_heap(Context *ctx, PlacerHeapCfg cfg)
+{
+ log_error("nextpnr was built without the HeAP placer\n");
+ return false;
+}
+
+PlacerHeapCfg::PlacerHeapCfg(Context *ctx) {}
+
+NEXTPNR_NAMESPACE_END
+
+#endif
diff --git a/common/place/placer_heap.h b/common/place/placer_heap.h
new file mode 100644
index 00000000..9c62869e
--- /dev/null
+++ b/common/place/placer_heap.h
@@ -0,0 +1,58 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2019 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * [[cite]] HeAP
+ * Analytical Placement for Heterogeneous FPGAs, Marcel Gort and Jason H. Anderson
+ * https://janders.eecg.utoronto.ca/pdfs/marcelfpl12.pdf
+ *
+ * [[cite]] SimPL
+ * SimPL: An Effective Placement Algorithm, Myung-Chul Kim, Dong-Jin Lee and Igor L. Markov
+ * http://www.ece.umich.edu/cse/awards/pdfs/iccad10-simpl.pdf
+ */
+
+#ifndef PLACER_HEAP_H
+#define PLACER_HEAP_H
+#include "log.h"
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+struct PlacerHeapCfg
+{
+ PlacerHeapCfg(Context *ctx);
+
+ float alpha, beta;
+ float criticalityExponent;
+ float timingWeight;
+ bool timing_driven;
+ float solverTolerance;
+ bool placeAllAtOnce;
+ bool parallelRefine;
+
+ int hpwl_scale_x, hpwl_scale_y;
+ int spread_scale_x, spread_scale_y;
+
+ // These cell types will be randomly locked to prevent singular matrices
+ pool<IdString> ioBufTypes;
+ // These cell types are part of the same unit (e.g. slices split into
+ // components) so will always be spread together
+ std::vector<pool<BelBucketId>> cellGroups;
+};
+
+extern bool placer_heap(Context *ctx, PlacerHeapCfg cfg);
+NEXTPNR_NAMESPACE_END
+#endif
diff --git a/common/place/timing_opt.cc b/common/place/timing_opt.cc
new file mode 100644
index 00000000..f9246292
--- /dev/null
+++ b/common/place/timing_opt.cc
@@ -0,0 +1,561 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2018 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+/*
+ * Timing-optimised detailed placement algorithm using BFS of the neighbour graph created from cells
+ * on a critical path
+ *
+ * Based on "An Effective Timing-Driven Detailed Placement Algorithm for FPGAs"
+ * https://www.cerc.utexas.edu/utda/publications/C205.pdf
+ *
+ * Modifications have been made to deal with the smaller Bels that nextpnr uses instead of swapping whole
+ * tiles, and to deal with the fact that not every cell on the crit path may be swappable.
+ */
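+
+/*
+ * Rough flow: find_crit_paths() extracts near-critical paths, find_neighbours() collects a small pool
+ * of candidate bels around each movable path cell, and optimise_path() then runs a BFS over
+ * (path position, candidate bel) pairs, tracking cumulative arc delay plus a backtrace of swaps and
+ * only keeping moves that pass acceptable_move().
+ */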
+
+#include "timing_opt.h"
+#include <boost/range/adaptor/reversed.hpp>
+#include <queue>
+#include "nextpnr.h"
+#include "timing.h"
+#include "util.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+class TimingOptimiser
+{
+ public:
+ TimingOptimiser(Context *ctx, TimingOptCfg cfg) : ctx(ctx), cfg(cfg), tmg(ctx){};
+ bool optimise()
+ {
+ log_info("Running timing-driven placement optimisation...\n");
+ ctx->lock();
+ if (ctx->verbose)
+ timing_analysis(ctx, false, true, false, false);
+ tmg.setup();
+ for (int i = 0; i < 30; i++) {
+ log_info(" Iteration %d...\n", i);
+ tmg.run();
+ setup_delay_limits();
+ auto crit_paths = find_crit_paths(0.98, 50000);
+ for (auto &path : crit_paths)
+ optimise_path(path);
+ if (ctx->verbose)
+ timing_analysis(ctx, false, true, false, false);
+ }
+ ctx->unlock();
+ return true;
+ }
+
+ private:
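+ // setup_delay_limits() gives each sink a delay budget: its current routed delay plus a tenth of the
+ // slack it has over the domain's worst slack, so less-critical arcs are allowed to degrade slightly.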
+ void setup_delay_limits()
+ {
+ max_net_delay.clear();
+ for (auto &net : ctx->nets) {
+ NetInfo *ni = net.second.get();
+ if (ni->driver.cell == nullptr)
+ continue;
+ for (auto usr : ni->users) {
+ max_net_delay[std::make_pair(usr.cell->name, usr.port)] = std::numeric_limits<delay_t>::max();
+ }
+ for (auto usr : ni->users) {
+ delay_t net_delay = ctx->getNetinfoRouteDelay(ni, usr);
+ delay_t slack = tmg.get_setup_slack(CellPortKey(usr));
+ delay_t domain_slack = tmg.get_domain_setup_slack(CellPortKey(usr));
+ if (slack == std::numeric_limits<delay_t>::max())
+ continue;
+ max_net_delay[std::make_pair(usr.cell->name, usr.port)] = net_delay + ((slack - domain_slack) / 10);
+ }
+ }
+ }
+
+ bool check_cell_delay_limits(CellInfo *cell)
+ {
+ for (const auto &port : cell->ports) {
+ int nc;
+ if (ctx->getPortTimingClass(cell, port.first, nc) == TMG_IGNORE)
+ continue;
+ NetInfo *net = port.second.net;
+ if (net == nullptr)
+ continue;
+ if (port.second.type == PORT_IN) {
+ if (net->driver.cell == nullptr || net->driver.cell->bel == BelId())
+ continue;
+ for (auto user : net->users) {
+ if (user.cell == cell && user.port == port.first) {
+ if (ctx->predictArcDelay(net, user) >
+ 1.1 * max_net_delay.at(std::make_pair(cell->name, port.first)))
+ return false;
+ }
+ }
+
+ } else if (port.second.type == PORT_OUT) {
+ for (auto user : net->users) {
+ // This could get expensive for high-fanout nets??
+ BelId dstBel = user.cell->bel;
+ if (dstBel == BelId())
+ continue;
+ if (ctx->predictArcDelay(net, user) >
+ 1.1 * max_net_delay.at(std::make_pair(user.cell->name, user.port))) {
+
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+ }
+
+ BelId cell_swap_bel(CellInfo *cell, BelId newBel)
+ {
+ BelId oldBel = cell->bel;
+ if (oldBel == newBel)
+ return oldBel;
+ CellInfo *other_cell = ctx->getBoundBelCell(newBel);
+ NPNR_ASSERT(other_cell == nullptr || other_cell->belStrength <= STRENGTH_WEAK);
+ ctx->unbindBel(oldBel);
+ if (other_cell != nullptr) {
+ ctx->unbindBel(newBel);
+ ctx->bindBel(oldBel, other_cell, STRENGTH_WEAK);
+ }
+ ctx->bindBel(newBel, cell, STRENGTH_WEAK);
+ return oldBel;
+ }
+
+ // Check that a series of moves are both legal and remain within maximum delay bounds
+ // Moves are specified as a vector of pairs <cell, oldBel>
+ bool acceptable_move(std::vector<std::pair<CellInfo *, BelId>> &move, bool check_delays = true)
+ {
+ for (auto &entry : move) {
+ if (!ctx->isBelLocationValid(entry.first->bel))
+ return false;
+ if (!ctx->isBelLocationValid(entry.second))
+ return false;
+ if (!check_delays)
+ continue;
+ if (!check_cell_delay_limits(entry.first))
+ return false;
+ // We might have swapped another cell onto the original bel. Check this for max delay violations
+ // too
+ CellInfo *swapped = ctx->getBoundBelCell(entry.second);
+ if (swapped != nullptr && !check_cell_delay_limits(swapped))
+ return false;
+ }
+ return true;
+ }
+
+ int find_neighbours(CellInfo *cell, IdString prev_cell, int d, bool allow_swap)
+ {
+ BelId curr = cell->bel;
+ Loc curr_loc = ctx->getBelLocation(curr);
+ int found_count = 0;
+ cell_neighbour_bels[cell->name] = pool<BelId>{};
+ for (int dy = -d; dy <= d; dy++) {
+ for (int dx = -d; dx <= d; dx++) {
+ // Go through all the Bels at this location
+ // First, find all bels of the correct type that are either unbound or bound normally
+ // Strongly bound bels are ignored
+ // FIXME: This means that we cannot touch carry chains or similar relatively constrained macros
+ std::vector<BelId> free_bels_at_loc;
+ std::vector<BelId> bound_bels_at_loc;
+ for (auto bel : ctx->getBelsByTile(curr_loc.x + dx, curr_loc.y + dy)) {
+ if (!ctx->isValidBelForCellType(cell->type, bel))
+ continue;
+ CellInfo *bound = ctx->getBoundBelCell(bel);
+ if (bound == nullptr) {
+ free_bels_at_loc.push_back(bel);
+ } else if (bound->belStrength <= STRENGTH_WEAK && bound->cluster == ClusterId()) {
+ bound_bels_at_loc.push_back(bel);
+ }
+ }
+ BelId candidate;
+
+ while (!free_bels_at_loc.empty() || !bound_bels_at_loc.empty()) {
+ BelId try_bel;
+ if (!free_bels_at_loc.empty()) {
+ int try_idx = ctx->rng(int(free_bels_at_loc.size()));
+ try_bel = free_bels_at_loc.at(try_idx);
+ free_bels_at_loc.erase(free_bels_at_loc.begin() + try_idx);
+ } else {
+ int try_idx = ctx->rng(int(bound_bels_at_loc.size()));
+ try_bel = bound_bels_at_loc.at(try_idx);
+ bound_bels_at_loc.erase(bound_bels_at_loc.begin() + try_idx);
+ }
+ if (bel_candidate_cells.count(try_bel) && !allow_swap) {
+ // Overlap is only allowed if it is with the previous cell (this is handled by removing those
+ // edges in the graph), or if allow_swap is true to deal with cases where overlap means few
+ // neighbours are identified
+ if (bel_candidate_cells.at(try_bel).size() > 1 ||
+ (bel_candidate_cells.at(try_bel).size() == 1 &&
+ *(bel_candidate_cells.at(try_bel).begin()) != prev_cell))
+ continue;
+ }
+ // TODO: what else to check here?
+ candidate = try_bel;
+ break;
+ }
+
+ if (candidate != BelId()) {
+ cell_neighbour_bels[cell->name].insert(candidate);
+ bel_candidate_cells[candidate].insert(cell->name);
+ // Work out if we need to delete any overlap
+ std::vector<IdString> overlap;
+ for (auto other : bel_candidate_cells[candidate])
+ if (other != cell->name && other != prev_cell)
+ overlap.push_back(other);
+ if (overlap.size() > 0)
+ NPNR_ASSERT(allow_swap);
+ for (auto ov : overlap) {
+ bel_candidate_cells[candidate].erase(ov);
+ cell_neighbour_bels[ov].erase(candidate);
+ }
+ }
+ }
+ }
+ return found_count;
+ }
+
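+ // Select up to max_count nets whose most critical sink exceeds crit_thresh, then walk each one
+ // backwards and forwards along the most critical unused combinational arcs to assemble full paths.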
+ std::vector<std::vector<PortRef *>> find_crit_paths(float crit_thresh, size_t max_count)
+ {
+ std::vector<std::vector<PortRef *>> crit_paths;
+ std::vector<std::pair<NetInfo *, store_index<PortRef>>> crit_nets;
+ std::vector<IdString> netnames;
+ std::transform(ctx->nets.begin(), ctx->nets.end(), std::back_inserter(netnames),
+ [](const std::pair<IdString, std::unique_ptr<NetInfo>> &kv) { return kv.first; });
+ ctx->sorted_shuffle(netnames);
+ for (auto net : netnames) {
+ if (crit_nets.size() >= max_count)
+ break;
+ float highest_crit = 0;
+ store_index<PortRef> crit_user_idx{};
+ NetInfo *ni = ctx->nets.at(net).get();
+ for (auto usr : ni->users.enumerate()) {
+ float crit = tmg.get_criticality(CellPortKey(usr.value));
+ if (crit > highest_crit) {
+ highest_crit = crit;
+ crit_user_idx = usr.index;
+ }
+ }
+ if (highest_crit > crit_thresh)
+ crit_nets.emplace_back(ni, crit_user_idx);
+ }
+
+ pool<PortRef *, hash_ptr_ops> used_ports;
+
+ for (auto crit_net : crit_nets) {
+
+ if (used_ports.count(&(crit_net.first->users.at(crit_net.second))))
+ continue;
+
+ std::deque<PortRef *> crit_path;
+
+ // FIXME: This will fail badly on combinational loops
+
+ // Iterate backwards following greatest criticality
+ NetInfo *back_cursor = crit_net.first;
+ while (back_cursor != nullptr) {
+ float max_crit = 0;
+ std::pair<NetInfo *, store_index<PortRef>> crit_sink{nullptr, {}};
+ CellInfo *cell = back_cursor->driver.cell;
+ if (cell == nullptr)
+ break;
+ for (auto port : cell->ports) {
+ if (port.second.type != PORT_IN)
+ continue;
+ NetInfo *pn = port.second.net;
+ if (pn == nullptr)
+ continue;
+ int ccount;
+ DelayQuad combDelay;
+ TimingPortClass tpclass = ctx->getPortTimingClass(cell, port.first, ccount);
+ if (tpclass != TMG_COMB_INPUT)
+ continue;
+ bool is_path = ctx->getCellDelay(cell, port.first, back_cursor->driver.port, combDelay);
+ if (!is_path)
+ continue;
+ float usr_crit = tmg.get_criticality(CellPortKey(cell->name, port.first));
+ if (used_ports.count(&(pn->users.at(port.second.user_idx))))
+ continue;
+ if (usr_crit >= max_crit) {
+ max_crit = usr_crit;
+ crit_sink = std::make_pair(pn, port.second.user_idx);
+ }
+ }
+
+ if (crit_sink.first != nullptr) {
+ crit_path.push_front(&(crit_sink.first->users.at(crit_sink.second)));
+ used_ports.insert(&(crit_sink.first->users.at(crit_sink.second)));
+ }
+ back_cursor = crit_sink.first;
+ }
+ // Iterate forwards following greatest criticality
+ PortRef *fwd_cursor = &(crit_net.first->users.at(crit_net.second));
+ while (fwd_cursor != nullptr) {
+ crit_path.push_back(fwd_cursor);
+ float max_crit = 0;
+ std::pair<NetInfo *, store_index<PortRef>> crit_sink{nullptr, {}};
+ CellInfo *cell = fwd_cursor->cell;
+ for (auto port : cell->ports) {
+ if (port.second.type != PORT_OUT)
+ continue;
+ NetInfo *pn = port.second.net;
+ if (pn == nullptr)
+ continue;
+
+ int ccount;
+ DelayQuad combDelay;
+ TimingPortClass tpclass = ctx->getPortTimingClass(cell, port.first, ccount);
+ if (tpclass != TMG_COMB_OUTPUT && tpclass != TMG_REGISTER_OUTPUT)
+ continue;
+ bool is_path = ctx->getCellDelay(cell, fwd_cursor->port, port.first, combDelay);
+ if (!is_path)
+ continue;
+ for (auto usr : pn->users.enumerate()) {
+ if (used_ports.count(&(pn->users.at(usr.index))))
+ continue;
+ float crit = tmg.get_criticality(CellPortKey(usr.value));
+ if (crit >= max_crit) {
+ max_crit = crit;
+ crit_sink = std::make_pair(pn, usr.index);
+ }
+ }
+ }
+ if (crit_sink.first != nullptr) {
+ fwd_cursor = &(crit_sink.first->users.at(crit_sink.second));
+ used_ports.insert(&(crit_sink.first->users.at(crit_sink.second)));
+ } else {
+ fwd_cursor = nullptr;
+ }
+ }
+
+ std::vector<PortRef *> crit_path_vec;
+ std::copy(crit_path.begin(), crit_path.end(), std::back_inserter(crit_path_vec));
+ crit_paths.push_back(crit_path_vec);
+ }
+
+ return crit_paths;
+ }
+
+ void optimise_path(std::vector<PortRef *> &path)
+ {
+ path_cells.clear();
+ cell_neighbour_bels.clear();
+ bel_candidate_cells.clear();
+ if (ctx->debug)
+ log_info("Optimising the following path: \n");
+
+ auto front_port = path.front();
+ NetInfo *front_net = front_port->cell->ports.at(front_port->port).net;
+ if (front_net != nullptr && front_net->driver.cell != nullptr) {
+ auto front_cell = front_net->driver.cell;
+ if (front_cell->belStrength <= STRENGTH_WEAK && cfg.cellTypes.count(front_cell->type) &&
+ front_cell->cluster == ClusterId()) {
+ path_cells.push_back(front_cell->name);
+ }
+ }
+
+ for (auto port : path) {
+ if (ctx->debug) {
+ float crit = tmg.get_criticality(CellPortKey(*port));
+ log_info(" %s.%s at %s crit %0.02f\n", port->cell->name.c_str(ctx), port->port.c_str(ctx),
+ ctx->nameOfBel(port->cell->bel), crit);
+ }
+ if (std::find(path_cells.begin(), path_cells.end(), port->cell->name) != path_cells.end())
+ continue;
+ if (port->cell->belStrength > STRENGTH_WEAK || !cfg.cellTypes.count(port->cell->type) ||
+ port->cell->cluster != ClusterId())
+ continue;
+ if (ctx->debug)
+ log_info(" can move\n");
+ path_cells.push_back(port->cell->name);
+ }
+
+ if (path_cells.size() < 2) {
+ if (ctx->debug) {
+ log_info("Too few moveable cells; skipping path\n");
+ log_break();
+ }
+
+ return;
+ }
+
+ // Calculate original delay before touching anything
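+ // (path entries are net users, i.e. sink ports, so each arc is predicted once from its sink side)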
+ delay_t original_delay = 0;
+
+ for (size_t i = 0; i < path.size(); i++) {
+ auto &port = path.at(i)->cell->ports.at(path.at(i)->port);
+ NetInfo *pn = port.net;
+ if (port.user_idx)
+ original_delay += ctx->predictArcDelay(pn, pn->users.at(port.user_idx));
+ }
+
+ IdString last_cell;
+ const int d = 2; // FIXME: how to best determine d
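+ // Build the search space: for each path cell, collect candidate BELs within distance d of its
+ // current location, linked back to the cells that may occupy them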
+ for (auto cell : path_cells) {
+ // FIXME: when should we allow swapping due to a lack of candidates
+ find_neighbours(ctx->cells[cell].get(), last_cell, d, false);
+ last_cell = cell;
+ }
+
+ if (ctx->debug) {
+ for (auto cell : path_cells) {
+ log_info("Candidate neighbours for %s (%s):\n", cell.c_str(ctx), ctx->nameOfBel(ctx->cells[cell]->bel));
+ for (auto neigh : cell_neighbour_bels.at(cell)) {
+ log_info(" %s\n", ctx->nameOfBel(neigh));
+ }
+ }
+ }
+
+ // Actual BFS path optimisation algorithm
+ dict<IdString, dict<BelId, delay_t>> cumul_costs;
+ dict<std::pair<IdString, BelId>, std::pair<IdString, BelId>> backtrace;
+ std::queue<std::pair<int, BelId>> visit;
+ pool<std::pair<int, BelId>> to_visit;
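+ // BFS states are (index into path_cells, candidate BEL) pairs. cumul_costs records the best
+ // prefix delay found so far for placing each path cell on each candidate BEL, and backtrace
+ // records the predecessor assignment so a complete placement can be replayed later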
+
+ for (auto startbel : cell_neighbour_bels[path_cells.front()]) {
+ // Swap for legality check
+ CellInfo *cell = ctx->cells.at(path_cells.front()).get();
+ BelId origBel = cell_swap_bel(cell, startbel);
+ std::vector<std::pair<CellInfo *, BelId>> move{std::make_pair(cell, origBel)};
+ if (acceptable_move(move)) {
+ auto entry = std::make_pair(0, startbel);
+ visit.push(entry);
+ cumul_costs[path_cells.front()][startbel] = 0;
+ }
+ // Swap back
+ cell_swap_bel(cell, origBel);
+ }
+
+ while (!visit.empty()) {
+ auto entry = visit.front();
+ visit.pop();
+ auto cellname = path_cells.at(entry.first);
+ if (entry.first == int(path_cells.size()) - 1)
+ continue;
+ std::vector<std::pair<CellInfo *, BelId>> move;
+ // Apply the entire backtrace for accurate legality and delay checks
+ // This is probably pretty expensive (but also probably pales in comparison to the number of swaps
+ // SA will make...)
+ std::vector<std::pair<IdString, BelId>> route_to_entry;
+ auto cursor = std::make_pair(cellname, entry.second);
+ route_to_entry.push_back(cursor);
+ while (backtrace.count(cursor)) {
+ cursor = backtrace.at(cursor);
+ route_to_entry.push_back(cursor);
+ }
+ for (auto rt_entry : boost::adaptors::reverse(route_to_entry)) {
+ CellInfo *cell = ctx->cells.at(rt_entry.first).get();
+ BelId origBel = cell_swap_bel(cell, rt_entry.second);
+ move.push_back(std::make_pair(cell, origBel));
+ }
+
+ // Have a look at where we can travel from here
+ for (auto neighbour : cell_neighbour_bels.at(path_cells.at(entry.first + 1))) {
+ // The next cell cannot use the BEL occupied by the current path cell in this state;
+ // such overlapping edges are skipped
+ if (neighbour == entry.second)
+ continue;
+ // Experimentally swap the next path cell onto the neighbour bel we are trying
+ IdString ncname = path_cells.at(entry.first + 1);
+ CellInfo *next_cell = ctx->cells.at(ncname).get();
+ BelId origBel = cell_swap_bel(next_cell, neighbour);
+ move.push_back(std::make_pair(next_cell, origBel));
+
+ delay_t total_delay = 0;
+
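+ // total_delay is the prefix cost: the predicted delay of every arc up to and including the
+ // arc into next_cell; arcs further along the path are costed when later path cells are
+ // expanded with this placement replayed from the backtrace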
+ for (size_t i = 0; i < path.size(); i++) {
+ auto &port = path.at(i)->cell->ports.at(path.at(i)->port);
+ NetInfo *pn = port.net;
+ if (port.user_idx)
+ total_delay += ctx->predictArcDelay(pn, pn->users.at(port.user_idx));
+ if (path.at(i)->cell == next_cell)
+ break;
+ }
+
+ // First, check if the move is actually worthwhile from a delay point of view before the expensive
+ // legality check
+ if (!cumul_costs.count(ncname) || !cumul_costs.at(ncname).count(neighbour) ||
+ cumul_costs.at(ncname).at(neighbour) > total_delay) {
+ // Now check that the swaps we have made to get here are legal and meet max delay requirements
+ if (acceptable_move(move)) {
+ cumul_costs[ncname][neighbour] = total_delay;
+ backtrace[std::make_pair(ncname, neighbour)] = std::make_pair(cellname, entry.second);
+ if (!to_visit.count(std::make_pair(entry.first + 1, neighbour)))
+ visit.push(std::make_pair(entry.first + 1, neighbour));
+ }
+ }
+ // Revert the experimental swap
+ cell_swap_bel(move.back().first, move.back().second);
+ move.pop_back();
+ }
+
+ // Revert move by swapping cells back to their original order
+ // Execute swaps in reverse order to how we made them originally
+ for (auto move_entry : boost::adaptors::reverse(move)) {
+ cell_swap_bel(move_entry.first, move_entry.second);
+ }
+ }
+
+ // Did we find a solution??
+ if (cumul_costs.count(path_cells.back())) {
+ // Find the end position with the lowest total delay
+ auto &end_options = cumul_costs.at(path_cells.back());
+ auto lowest = std::min_element(end_options.begin(), end_options.end(),
+ [](const std::pair<BelId, delay_t> &a, const std::pair<BelId, delay_t> &b) {
+ return a.second < b.second;
+ });
+ NPNR_ASSERT(lowest != end_options.end());
+
+ std::vector<std::pair<IdString, BelId>> route_to_solution;
+ auto cursor = std::make_pair(path_cells.back(), lowest->first);
+ route_to_solution.push_back(cursor);
+ while (backtrace.count(cursor)) {
+ cursor = backtrace.at(cursor);
+ route_to_solution.push_back(cursor);
+ }
+ if (ctx->debug)
+ log_info("Found a solution with cost %.02f ns (existing path %.02f ns)\n",
+ ctx->getDelayNS(lowest->second), ctx->getDelayNS(original_delay));
+ for (auto rt_entry : boost::adaptors::reverse(route_to_solution)) {
+ CellInfo *cell = ctx->cells.at(rt_entry.first).get();
+ cell_swap_bel(cell, rt_entry.second);
+ if (ctx->debug)
+ log_info(" %s at %s\n", rt_entry.first.c_str(ctx), ctx->nameOfBel(rt_entry.second));
+ }
+
+ } else {
+ if (ctx->debug)
+ log_info("Solution was not found\n");
+ }
+ if (ctx->debug)
+ log_break();
+ }
+
+ // Current candidate Bels for cells (linked in both directions)
+ std::vector<IdString> path_cells;
+ dict<IdString, pool<BelId>> cell_neighbour_bels;
+ dict<BelId, pool<IdString>> bel_candidate_cells;
+ // Map cell ports to net delay limit
+ dict<std::pair<IdString, IdString>, delay_t> max_net_delay;
+ Context *ctx;
+ TimingOptCfg cfg;
+ TimingAnalyser tmg;
+};
+
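+ // Illustrative usage from an arch-specific pass (a sketch; the cell type name below is
+ // hypothetical and architecture-dependent):
+ //
+ //   TimingOptCfg cfg(ctx);
+ //   cfg.cellTypes.insert(ctx->id("LOGIC_CELL")); // only the arch's logic cells will be moved
+ //   timing_opt(ctx, cfg);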
+bool timing_opt(Context *ctx, TimingOptCfg cfg) { return TimingOptimiser(ctx, cfg).optimise(); }
+
+NEXTPNR_NAMESPACE_END
diff --git a/common/place/timing_opt.h b/common/place/timing_opt.h
new file mode 100644
index 00000000..8f8bc709
--- /dev/null
+++ b/common/place/timing_opt.h
@@ -0,0 +1,37 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2018 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "log.h"
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+struct TimingOptCfg
+{
+ TimingOptCfg(Context *ctx) {}
+
+ // The timing optimiser will *only* optimise cells of these types
+ // Normally these would only be logic cells (or tiles if applicable), as the algorithm makes
+ // little sense for other cell types
+ pool<IdString> cellTypes;
+};
+
+extern bool timing_opt(Context *ctx, TimingOptCfg cfg);
+
+NEXTPNR_NAMESPACE_END