author:    gatecat <gatecat@ds0.me>      2022-03-03 21:53:26 +0000
committer: GitHub <noreply@github.com>   2022-03-03 21:53:26 +0000
commit:    2c6ca4836fd30faa5cdbd931a352866e5a04104d (patch)
tree:      50c2c44d8ee36cfbcb765b04912ee8fb678248d9
parent:    0a70b9c992c06a7553725b3742052eb95abd5f20 (diff)
parent:    cc9f99a80c9aa9b69bc2a54060b5fbcec799f145 (diff)
Merge pull request #929 from gatecat/gatecat/parallel_refine
parallel_refine: New, parallelised placement refinement pass
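
The new pass is opt-in: the `--parallel-refine` flag added in `common/command.cc` sets the `placerHeap/parallelRefine` setting, and the new `--threads` flag controls the worker count. When the setting is active, the HeAP placer hands refinement off to the new engine instead of the existing placer1-based pass; condensed from the `placer_heap.cc` hunk below:

```cpp
// Condensed from the placer_heap.cc hunk in this diff: after analytic
// placement finishes, HeAP dispatches to the parallel refiner when the
// placerHeap/parallelRefine setting is true, else to the classic placer1
// refinement pass.
if (cfg.parallelRefine) {
    if (!parallel_refine(ctx, ParallelRefineCfg(ctx)))
        return false;
} else {
    if (!placer1_refine(ctx, Placer1Cfg(ctx)))
        return false;
}
```

An illustrative invocation (the architecture binary and file names here are placeholders, not part of this change): `nextpnr-ecp5 --json top.json --textcfg top.config --parallel-refine --threads 8`.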
common/command.cc         |  11
common/parallel_refine.cc | 946
common/parallel_refine.h  |  43
common/placer_heap.cc     |  13
common/placer_heap.h      |   1
5 files changed, 1012 insertions(+), 2 deletions(-)
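
In outline, `parallel_refine.cc` below works as follows: the design is recursively split into one spatial partition per thread (alternating x and y cuts with a randomised pivot), each thread runs simulated-annealing style single-cell and cluster swaps confined to its own partition, tracking incremental HPWL and criticality-weighted delay deltas against a thread-local cell-to-bel map, and only bind/unbind calls into the architecture API are serialised through a shared mutex. The acceptance rule normalises both cost deltas against design-wide totals; a self-contained paraphrase (not a verbatim excerpt) of `ThreadState::accept_move()`, where `rand01` stands in for the refiner's scaled RNG draw:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Sketch of the acceptance test used by the refiner: wirelength and timing
// deltas are normalised by the design-wide totals, blended by lambda, and
// put through a standard Metropolis criterion.
bool accept(double timing_delta, std::int64_t wirelen_delta, double lambda,
            double total_timing_cost, std::int64_t total_wirelen,
            double temperature, double rand01)
{
    const double epsilon = 1e-20; // guards against division by zero early on
    double delta = lambda * (timing_delta / std::max(epsilon, total_timing_cost)) +
                   (1.0 - lambda) * (double(wirelen_delta) / std::max(epsilon, double(total_wirelen)));
    // Always accept improvements; accept uphill moves with probability
    // exp(-delta / T), which shrinks as the temperature is cooled
    return delta < 0 || (temperature > 1e-8 && rand01 <= std::exp(-delta / temperature));
}
```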
diff --git a/common/command.cc b/common/command.cc
index d4279a58..56327847 100644
--- a/common/command.cc
+++ b/common/command.cc
@@ -109,6 +109,7 @@ po::options_description CommandHandler::getGeneralOptions()
     general.add_options()("debug", "debug output");
     general.add_options()("debug-placer", "debug output from placer only");
     general.add_options()("debug-router", "debug output from router only");
+    general.add_options()("threads", po::value<int>(), "number of threads for passes where this is configurable");
     general.add_options()("force,f", "keep running after errors");
 #ifndef NO_GUI
@@ -173,6 +174,8 @@ po::options_description CommandHandler::getGeneralOptions()
                           "placer heap criticality exponent (int, default: 2)");
     general.add_options()("placer-heap-timingweight", po::value<int>(),
                           "placer heap timing weight (int, default: 10)");
+    general.add_options()("parallel-refine", "use new experimental parallelised engine for placement refinement");
+
     general.add_options()("router2-heatmap", po::value<std::string>(),
                           "prefix for router2 resource congestion heatmaps");
@@ -225,6 +228,10 @@ void CommandHandler::setupContext(Context *ctx)
         ctx->rngseed(vm["seed"].as<int>());
     }
+    if (vm.count("threads")) {
+        ctx->settings[ctx->id("threads")] = vm["threads"].as<int>();
+    }
+
     if (vm.count("randomize-seed")) {
         std::random_device randDev{};
         std::uniform_int_distribution<int> distrib{1};
@@ -298,6 +305,10 @@ void CommandHandler::setupContext(Context *ctx)
     if (vm.count("placer-heap-timingweight"))
         ctx->settings[ctx->id("placerHeap/timingWeight")] = std::to_string(vm["placer-heap-timingweight"].as<int>());
+
+    if (vm.count("parallel-refine"))
+        ctx->settings[ctx->id("placerHeap/parallelRefine")] = true;
+
     if (vm.count("router2-heatmap"))
         ctx->settings[ctx->id("router2/heatmap")] = vm["router2-heatmap"].as<std::string>();
     if (vm.count("tmg-ripup") || vm.count("router2-tmg-ripup"))
diff --git a/common/parallel_refine.cc b/common/parallel_refine.cc
new file mode 100644
index 00000000..ea97b852
--- /dev/null
+++ b/common/parallel_refine.cc
@@ -0,0 +1,946 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2021-22 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "parallel_refine.h"
+#include "fast_bels.h"
+#include "log.h"
+#include "timing.h"
+#include "scope_lock.h"
+
+#include <chrono>
+#include <mutex>
+#include <queue>
+#include <shared_mutex>
+#include <thread>
+
+NEXTPNR_NAMESPACE_BEGIN
+
+namespace {
+struct Partition
+{
+    int x0, y0, x1, y1;
+    std::vector<CellInfo *> cells;
+    Partition() = default;
+    explicit Partition(Context *ctx)
+    {
+        x0 = ctx->getGridDimX();
+        y0 = ctx->getGridDimY();
+        x1 = 0;
+        y1 = 0;
+        for (auto &cell : ctx->cells) {
+            Loc l = ctx->getBelLocation(cell.second->bel);
+            x0 = std::min(x0, l.x);
+            x1 = std::max(x1, l.x);
+            y0 = std::min(y0, l.y);
+            y1 = std::max(y1, l.y);
+            cells.push_back(cell.second.get());
+        }
+    }
+    void split(Context *ctx, bool yaxis, float pivot, Partition &l, Partition &r)
+    {
+        std::sort(cells.begin(), cells.end(), [&](CellInfo *a, CellInfo *b) {
+            Loc l0 = ctx->getBelLocation(a->bel), l1 = ctx->getBelLocation(b->bel);
+            return yaxis ? (l0.y < l1.y) : (l0.x < l1.x);
+        });
+        size_t pivot_point = size_t(cells.size() * pivot);
+        l.cells.clear();
+        r.cells.clear();
+        l.cells.reserve(pivot_point);
+        r.cells.reserve(cells.size() - pivot_point);
+        int pivot_coord = (pivot_point == 0) ? (yaxis ? y1 : x1)
+                                             : (yaxis ? ctx->getBelLocation(cells.at(pivot_point - 1)->bel).y
+                                                      : ctx->getBelLocation(cells.at(pivot_point - 1)->bel).x);
+        for (size_t i = 0; i < cells.size(); i++) {
+            Loc loc = ctx->getBelLocation(cells.at(i)->bel);
+            ((yaxis ? loc.y : loc.x) <= pivot_coord ? l.cells : r.cells).push_back(cells.at(i));
+        }
+        if (yaxis) {
+            l.x0 = r.x0 = x0;
+            l.x1 = r.x1 = x1;
+            l.y0 = y0;
+            l.y1 = pivot_coord;
+            r.y0 = (pivot_coord == y1) ? y1 : (pivot_coord + 1);
+            r.y1 = y1;
+        } else {
+            l.y0 = r.y0 = y0;
+            l.y1 = r.y1 = y1;
+            l.x0 = x0;
+            l.x1 = pivot_coord;
+            r.x0 = (pivot_coord == x1) ? x1 : (pivot_coord + 1);
+            r.x1 = x1;
+        }
+    }
+};
+
+typedef int64_t wirelen_t;
+
+struct NetBB
+{
+    // Actual bounding box
+    int x0 = 0, x1 = 0, y0 = 0, y1 = 0;
+    // Number of cells at each extremity
+    int nx0 = 0, nx1 = 0, ny0 = 0, ny1 = 0;
+    wirelen_t hpwl(const ParallelRefineCfg &cfg) const
+    {
+        return wirelen_t(cfg.hpwl_scale_x * (x1 - x0) + cfg.hpwl_scale_y * (y1 - y0));
+    }
+    static NetBB compute(const Context *ctx, const NetInfo *net, const dict<IdString, BelId> *cell2bel = nullptr)
+    {
+        NetBB result{};
+        if (!net->driver.cell)
+            return result;
+        auto bel_loc = [&](const CellInfo *cell) {
+            BelId bel = cell2bel ? cell2bel->at(cell->name) : cell->bel;
+            return ctx->getBelLocation(bel);
+        };
+        result.nx0 = result.nx1 = result.ny0 = result.ny1 = 1;
+        Loc drv_loc = bel_loc(net->driver.cell);
+        result.x0 = result.x1 = drv_loc.x;
+        result.y0 = result.y1 = drv_loc.y;
+        for (auto &usr : net->users) {
+            Loc l = bel_loc(usr.cell);
+            if (l.x == result.x0)
+                ++result.nx0; // on the edge
+            else if (l.x < result.x0) {
+                result.x0 = l.x; // extends the edge
+                result.nx0 = 1;
+            }
+            if (l.x == result.x1)
+                ++result.nx1; // on the edge
+            else if (l.x > result.x1) {
+                result.x1 = l.x; // extends the edge
+                result.nx1 = 1;
+            }
+            if (l.y == result.y0)
+                ++result.ny0; // on the edge
+            else if (l.y < result.y0) {
+                result.y0 = l.y; // extends the edge
+                result.ny0 = 1;
+            }
+            if (l.y == result.y1)
+                ++result.ny1; // on the edge
+            else if (l.y > result.y1) {
+                result.y1 = l.y; // extends the edge
+                result.ny1 = 1;
+            }
+        }
+        return result;
+    }
+};
+
+struct GlobalState
+{
+    explicit GlobalState(Context *ctx, ParallelRefineCfg cfg) : ctx(ctx), cfg(cfg), bels(ctx, false, 64), tmg(ctx){};
+    Context *ctx;
+    ParallelRefineCfg cfg;
+    FastBels bels;
+    std::vector<NetInfo *> flat_nets; // flat array of all nets in the design for fast referencing by index
+    std::vector<NetBB> last_bounds;
+    std::vector<std::vector<double>> last_tmg_costs;
+    dict<IdString, NetBB> region_bounds;
+    TimingAnalyser tmg;
+
+    std::shared_timed_mutex archapi_mutex;
+
+    double temperature = 1e-7;
+    int radius = 3;
+
+    wirelen_t total_wirelen = 0;
+    double total_timing_cost = 0;
+
+    double get_timing_cost(const NetInfo *net, store_index<PortRef> user,
+                           const dict<IdString, BelId> *cell2bel = nullptr)
+    {
+        if (!net->driver.cell)
+            return 0;
+        const auto &sink = net->users.at(user);
+        IdString driver_pin, sink_pin;
+        // Pick the first pin for a prediction; assume all will be similar enough
+        for (auto pin : ctx->getBelPinsForCellPin(net->driver.cell, net->driver.port)) {
+            driver_pin = pin;
+            break;
+        }
+        for (auto pin : ctx->getBelPinsForCellPin(sink.cell, sink.port)) {
+            sink_pin = pin;
+            break;
+        }
+        float crit = tmg.get_criticality(CellPortKey(sink));
+        BelId src_bel = cell2bel ? cell2bel->at(net->driver.cell->name) : net->driver.cell->bel;
+        BelId dst_bel = cell2bel ? cell2bel->at(sink.cell->name) : sink.cell->bel;
+        double delay = ctx->getDelayNS(ctx->predictDelay(src_bel, driver_pin, dst_bel, sink_pin));
+        return delay * std::pow(crit, cfg.crit_exp);
+    }
+
+    bool skip_net(const NetInfo *net) const
+    {
+        if (!net->driver.cell)
+            return true;
+        if (ctx->getBelGlobalBuf(net->driver.cell->bel))
+            return true;
+        return false;
+    }
+    bool timing_skip_net(const NetInfo *net) const
+    {
+        if (!net->driver.cell)
+            return true;
+        int cc;
+        auto cls = ctx->getPortTimingClass(net->driver.cell, net->driver.port, cc);
+        if (cls == TMG_IGNORE || cls == TMG_GEN_CLOCK)
+            return true;
+        return false;
+    }
+    // ....
+};
+
+struct ThreadState
+{
+    Context *ctx;         // Nextpnr context pointer
+    GlobalState &g;       // Refinement engine state
+    int idx;              // Index of the thread
+    DeterministicRNG rng; // Local RNG
+    // The cell partition that the thread works on
+    Partition p;
+    // Mapping from design-wide net index to thread-wide net index -- not all nets are in all partitions, so we can
+    // optimise
+    std::vector<int> thread_net_idx;
+    // List of nets inside the partition, and their committed bounding boxes & timing costs from the thread's
+    // perspective
+    std::vector<NetInfo *> thread_nets;
+    std::vector<NetBB> net_bounds;
+    std::vector<std::vector<double>> arc_tmg_cost;
+    std::vector<bool> ignored_nets, tmg_ignored_nets;
+    bool arch_state_dirty = false;
+    // Our local cell-bel map, which won't be affected by out-of-partition moves
+    dict<IdString, BelId> local_cell2bel;
+
+    // Data on an in-flight move
+    dict<IdString, std::pair<BelId, BelId>> moved_cells; // cell -> (old, new)
+    // For cluster moves only
+    std::vector<std::pair<CellInfo *, Loc>> cell_rel;
+    // For incremental wirelength and delay updates
+    wirelen_t wirelen_delta = 0;
+    double timing_delta = 0;
+
+    // Total moves made and accepted
+    int n_move = 0, n_accept = 0;
+
+    enum BoundChange
+    {
+        NO_CHANGE,
+        CELL_MOVED_INWARDS,
+        CELL_MOVED_OUTWARDS,
+        FULL_RECOMPUTE
+    };
+    // Wirelength-related changes are handled on a per-axis basis to reduce duplicated work
+    struct AxisChanges
+    {
+        std::vector<int> bounds_changed_nets;
+        std::vector<BoundChange> already_bounds_changed;
+    };
+    std::array<AxisChanges, 2> axes;
+    std::vector<NetBB> new_net_bounds;
+
+    std::vector<std::vector<bool>> already_timing_changed;
+    std::vector<std::pair<int, store_index<PortRef>>> timing_changed_arcs;
+    std::vector<double> new_timing_costs;
+
+    ThreadState(Context *ctx, GlobalState &g, int idx) : ctx(ctx), g(g), idx(idx){};
+    void set_partition(const Partition &part)
+    {
+        p = part;
+        thread_nets.clear();
+        thread_net_idx.resize(g.flat_nets.size());
+        std::fill(thread_net_idx.begin(), thread_net_idx.end(), -1);
+        // Determine the set of nets that are within the thread, and that we therefore care about
+        for (auto thread_cell : part.cells) {
+            for (auto &port : thread_cell->ports) {
+                if (!port.second.net)
+                    continue;
+                int global_idx = port.second.net->udata;
+                auto &thread_idx = thread_net_idx.at(global_idx);
+                // Already added to the set
+                if (thread_idx != -1)
+                    continue;
+                thread_idx = thread_nets.size();
+                thread_nets.push_back(port.second.net);
+            }
+        }
+        tmg_ignored_nets.clear();
+        ignored_nets.clear();
+        for (auto tn : thread_nets) {
+            ignored_nets.push_back(g.skip_net(tn));
+            tmg_ignored_nets.push_back(g.timing_skip_net(tn));
+        }
+        // Set up the original cell-bel map for all nets inside the thread
+        local_cell2bel.clear();
+        for (NetInfo *net : thread_nets) {
+            if (net->driver.cell)
+                local_cell2bel[net->driver.cell->name] = net->driver.cell->bel;
+            for (auto &usr : net->users)
+                local_cell2bel[usr.cell->name] = usr.cell->bel;
+        }
+    }
+    void setup_initial_state()
+    {
+        // Set up initial net bounding boxes and timing costs
+        net_bounds.clear();
+        arc_tmg_cost.clear();
+        for (auto tn : thread_nets) {
+            net_bounds.push_back(g.last_bounds.at(tn->udata));
+            arc_tmg_cost.push_back(g.last_tmg_costs.at(tn->udata));
+        }
+        new_net_bounds = net_bounds;
+        for (int j = 0; j < 2; j++) {
+            auto &a = axes.at(j);
+            a.already_bounds_changed.resize(net_bounds.size());
+        }
+        already_timing_changed.clear();
+        already_timing_changed.resize(net_bounds.size());
+        for (size_t i = 0; i < thread_nets.size(); i++)
+            already_timing_changed.at(i) = std::vector<bool>(thread_nets.at(i)->users.capacity());
+    }
+    bool bounds_check(BelId bel)
+    {
+        Loc l = ctx->getBelLocation(bel);
+        if (l.x < p.x0 || l.x > p.x1 || l.y < p.y0 || l.y > p.y1)
+            return false;
+        return true;
+    }
+    bool bind_move()
+    {
+        std::unique_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+        for (auto &entry : moved_cells) {
+            ctx->unbindBel(entry.second.first);
+        }
+        bool success = true;
+        for (auto &entry : moved_cells) {
+            // Make sure targets are available before we bind them
+            if (!ctx->checkBelAvail(entry.second.second)) {
+                success = false;
+                break;
+            }
+            ctx->bindBel(entry.second.second, ctx->cells.at(entry.first).get(), STRENGTH_WEAK);
+        }
+        arch_state_dirty = true;
+        return success;
+    }
+    bool check_validity()
+    {
+        std::shared_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+        bool result = true;
+        for (auto e : moved_cells) {
+            if (!ctx->isBelLocationValid(e.second.first)) {
+                // Have to check the old bel, too, as unbinding a bel could make a placement illegal by virtue of
+                // no longer enabling dedicated routes to be used
+                result = false;
+                break;
+            }
+            if (!ctx->isBelLocationValid(e.second.second)) {
+                result = false;
+                break;
+            }
+        }
+        return result;
+    }
+    void revert_move()
+    {
+        if (arch_state_dirty) {
+            // If changes to the arch state were made, revert them by restoring original cell bindings
+            std::unique_lock<std::shared_timed_mutex> l(g.archapi_mutex);
+            for (auto &entry : moved_cells) {
+                BelId curr_bound = ctx->cells.at(entry.first)->bel;
+                if (curr_bound != BelId())
+                    ctx->unbindBel(curr_bound);
+            }
+            for (auto &entry : moved_cells) {
+                ctx->bindBel(entry.second.first, ctx->cells.at(entry.first).get(), STRENGTH_WEAK);
+            }
+            arch_state_dirty = false;
+        }
+        for (auto &entry : moved_cells)
+            local_cell2bel[entry.first] = entry.second.first;
+    }
+    void commit_move()
+    {
+        arch_state_dirty = false;
+        for (auto &axis : axes) {
+            for (auto bc : axis.bounds_changed_nets) {
+                // Commit updated net bounds
+                net_bounds.at(bc) = new_net_bounds.at(bc);
+            }
+        }
+        if (g.cfg.timing_driven) {
+            NPNR_ASSERT(timing_changed_arcs.size() == new_timing_costs.size());
+            for (size_t i = 0; i < timing_changed_arcs.size(); i++) {
+                auto arc = timing_changed_arcs.at(i);
+                arc_tmg_cost.at(arc.first).at(arc.second.idx()) = new_timing_costs.at(i);
+            }
+        }
+    }
+    void compute_changes_for_cell(CellInfo *cell, BelId old_bel, BelId new_bel)
+    {
+        Loc new_loc = ctx->getBelLocation(new_bel);
+        Loc old_loc = ctx->getBelLocation(old_bel);
+        for (const auto &port : cell->ports) {
+            NetInfo *pn = port.second.net;
+            if (!pn)
+                continue;
+            int idx = thread_net_idx.at(pn->udata);
+            if (ignored_nets.at(idx))
+                continue;
+            NetBB &new_bounds = new_net_bounds.at(idx);
+            // For the x-axis (i=0) and y-axis (i=1)
+            for (int i = 0; i < 2; i++) {
+                auto &axis = axes.at(i);
+                // New and old positions on this axis
+                int new_pos = i ? new_loc.y : new_loc.x, old_pos = i ? old_loc.y : old_loc.x;
+                // References to updated bounding box entries
+                auto &b0 = i ? new_bounds.y0 : new_bounds.x0;
+                auto &n0 = i ? new_bounds.ny0 : new_bounds.nx0;
+                auto &b1 = i ? new_bounds.y1 : new_bounds.x1;
+                auto &n1 = i ? new_bounds.ny1 : new_bounds.nx1;
+                auto &change = axis.already_bounds_changed.at(idx);
+                // Lower bound
+                if (new_pos < b0) {
+                    // Further out than current lower bound
+                    b0 = new_pos;
+                    n0 = 1;
+                    if (change == NO_CHANGE) {
+                        change = CELL_MOVED_OUTWARDS;
+                        axis.bounds_changed_nets.push_back(idx);
+                    }
+                } else if (new_pos == b0 && old_pos > b0) {
+                    // Moved from inside onto current bound
+                    ++n0;
+                    if (change == NO_CHANGE) {
+                        change = CELL_MOVED_OUTWARDS;
+                        axis.bounds_changed_nets.push_back(idx);
+                    }
+                } else if (old_pos == b0 && new_pos > b0) {
+                    // Moved from current bound to inside
+                    if (change == NO_CHANGE)
+                        axis.bounds_changed_nets.push_back(idx);
+                    if (n0 == 1) {
+                        // Was the last cell on the bound; have to do a full recompute
+                        change = FULL_RECOMPUTE;
+                    } else {
+                        --n0;
+                        if (change == NO_CHANGE)
+                            change = CELL_MOVED_INWARDS;
+                    }
+                }
+                // Upper bound
+                if (new_pos > b1) {
+                    // Further out than current upper bound
+                    b1 = new_pos;
+                    n1 = 1;
+                    if (change == NO_CHANGE) {
+                        change = CELL_MOVED_OUTWARDS;
+                        axis.bounds_changed_nets.push_back(idx);
+                    }
+                } else if (new_pos == b1 && old_pos < b1) {
+                    // Moved onto current bound
+                    ++n1;
+                    if (change == NO_CHANGE) {
+                        change = CELL_MOVED_OUTWARDS;
+                        axis.bounds_changed_nets.push_back(idx);
+                    }
+                } else if (old_pos == b1 && new_pos < b1) {
+                    // Moved from current bound to inside
+                    if (change == NO_CHANGE)
+                        axis.bounds_changed_nets.push_back(idx);
+                    if (n1 == 1) {
+                        // Was the last cell on the bound; have to do a full recompute
+                        change = FULL_RECOMPUTE;
+                    } else {
+                        --n1;
+                        if (change == NO_CHANGE)
+                            change = CELL_MOVED_INWARDS;
+                    }
+                }
+            }
+            // Timing updates if timing driven
+            if (g.cfg.timing_driven && !tmg_ignored_nets.at(idx)) {
+                if (port.second.type == PORT_OUT) {
+                    int cc;
+                    TimingPortClass cls = ctx->getPortTimingClass(cell, port.first, cc);
+                    if (cls != TMG_IGNORE) {
+                        for (auto usr : pn->users.enumerate())
+                            if (!already_timing_changed.at(idx).at(usr.index.idx())) {
+                                timing_changed_arcs.emplace_back(std::make_pair(idx, usr.index));
+                                already_timing_changed.at(idx).at(usr.index.idx()) = true;
+                            }
+                    }
+                } else {
+                    auto usr = port.second.user_idx;
+                    if (!already_timing_changed.at(idx).at(usr.idx())) {
+                        timing_changed_arcs.emplace_back(std::make_pair(idx, usr));
+                        already_timing_changed.at(idx).at(usr.idx()) = true;
+                    }
+                }
+            }
+        }
+    }
+    void compute_total_change()
+    {
+        auto &xa = axes.at(0), &ya = axes.at(1);
+        for (auto &bc : xa.bounds_changed_nets)
+            if (xa.already_bounds_changed.at(bc) == FULL_RECOMPUTE)
+                new_net_bounds.at(bc) = NetBB::compute(ctx, thread_nets.at(bc), &local_cell2bel);
+        for (auto &bc : ya.bounds_changed_nets)
+            if (xa.already_bounds_changed.at(bc) != FULL_RECOMPUTE &&
+                ya.already_bounds_changed.at(bc) == FULL_RECOMPUTE)
+                new_net_bounds.at(bc) = NetBB::compute(ctx, thread_nets.at(bc), &local_cell2bel);
+        for (auto &bc : xa.bounds_changed_nets)
+            wirelen_delta += (new_net_bounds.at(bc).hpwl(g.cfg) - net_bounds.at(bc).hpwl(g.cfg));
+        for (auto &bc : ya.bounds_changed_nets)
+            if (xa.already_bounds_changed.at(bc) == NO_CHANGE)
+                wirelen_delta += (new_net_bounds.at(bc).hpwl(g.cfg) - net_bounds.at(bc).hpwl(g.cfg));
+        if (g.cfg.timing_driven) {
+            NPNR_ASSERT(new_timing_costs.empty());
+            for (auto arc : timing_changed_arcs) {
+                double new_cost = g.get_timing_cost(thread_nets.at(arc.first), arc.second, &local_cell2bel);
+                timing_delta += (new_cost - arc_tmg_cost.at(arc.first).at(arc.second.idx()));
+                new_timing_costs.push_back(new_cost);
+            }
+        }
+    }
+    void reset_move_state()
+    {
+        moved_cells.clear();
+        cell_rel.clear();
+        for (auto &axis : axes) {
+            for (auto bc : axis.bounds_changed_nets) {
+                new_net_bounds.at(bc) = net_bounds.at(bc);
+                axis.already_bounds_changed[bc] = NO_CHANGE;
+            }
+            axis.bounds_changed_nets.clear();
+        }
+        for (auto &arc : timing_changed_arcs) {
+            already_timing_changed.at(arc.first).at(arc.second.idx()) = false;
+        }
+        timing_changed_arcs.clear();
+        new_timing_costs.clear();
+        wirelen_delta = 0;
+        timing_delta = 0;
+    }
+
+    bool accept_move()
+    {
+        double delta = g.cfg.lambda * (timing_delta / std::max<double>(epsilon, g.total_timing_cost)) +
+                       (1.0 - g.cfg.lambda) * (double(wirelen_delta) / std::max<double>(epsilon, g.total_wirelen));
+        return delta < 0 ||
+               (g.temperature > 1e-8 && (rng.rng() / float(0x3fffffff)) <= std::exp(-delta / g.temperature));
+    }
+
+    bool add_to_move(CellInfo *cell, BelId old_bel, BelId new_bel)
+    {
+        if (!bounds_check(old_bel) || !bounds_check(new_bel))
+            return false;
+        if (!ctx->isValidBelForCellType(cell->type, new_bel))
+            return false;
+        NPNR_ASSERT(!moved_cells.count(cell->name));
+        moved_cells[cell->name] = std::make_pair(old_bel, new_bel);
+        local_cell2bel[cell->name] = new_bel;
+        compute_changes_for_cell(cell, old_bel, new_bel);
+        return true;
+    }
+
+    static constexpr double epsilon = 1e-20;
+    bool single_cell_swap(CellInfo *cell, BelId new_bel)
+    {
+        NPNR_ASSERT(moved_cells.empty());
+        BelId old_bel = cell->bel;
+        CellInfo *bound = ctx->getBoundBelCell(new_bel);
+        if (bound && (bound->belStrength > STRENGTH_STRONG || bound->cluster != ClusterId()))
+            return false;
+        if (!add_to_move(cell, old_bel, new_bel))
+            goto fail;
+        if (bound && !add_to_move(bound, new_bel, old_bel))
+            goto fail;
+        compute_total_change();
+        // SA acceptance criteria
+
+        if (!accept_move()) {
+            // SA fail
+            goto fail;
+        }
+        // Check validity rules
+        if (!bind_move())
+            goto fail;
+        if (!check_validity())
+            goto fail;
+        // Accepted!
+        commit_move();
+        reset_move_state();
+        return true;
+    fail:
+        revert_move();
+        reset_move_state();
+        return false;
+    }
+
+    bool chain_swap(CellInfo *root_cell, BelId new_root_bel)
+    {
+        NPNR_ASSERT(moved_cells.empty());
+        std::queue<std::pair<ClusterId, BelId>> displaced_clusters;
+        pool<BelId> used_bels;
+        displaced_clusters.emplace(root_cell->cluster, new_root_bel);
+        while (!displaced_clusters.empty()) {
+            std::vector<std::pair<CellInfo *, BelId>> dest_bels;
+            auto cursor = displaced_clusters.front();
+            displaced_clusters.pop();
+            if (!ctx->getClusterPlacement(cursor.first, cursor.second, dest_bels))
+                goto fail;
+            for (const auto &db : dest_bels) {
+                BelId old_bel = db.first->bel;
+                if (moved_cells.count(db.first->name))
+                    goto fail;
+                if (!add_to_move(db.first, old_bel, db.second))
+                    goto fail;
+                if (used_bels.count(db.second))
+                    goto fail;
+                used_bels.insert(db.second);
+
+                CellInfo *bound = ctx->getBoundBelCell(db.second);
+                if (bound) {
+                    if (moved_cells.count(bound->name)) {
+                        // Don't move a cell multiple times in the same go
+                        goto fail;
+                    } else if (bound->belStrength > STRENGTH_STRONG) {
+                        goto fail;
+                    } else if (bound->cluster != ClusterId()) {
+                        // Displace the entire cluster
+                        Loc old_loc = ctx->getBelLocation(old_bel);
+                        Loc bound_loc = ctx->getBelLocation(bound->bel);
+                        Loc root_loc = ctx->getBelLocation(ctx->getClusterRootCell(bound->cluster)->bel);
+                        BelId new_root = ctx->getBelByLocation(Loc(old_loc.x + (root_loc.x - bound_loc.x),
+                                                                   old_loc.y + (root_loc.y - bound_loc.y),
+                                                                   old_loc.z + (root_loc.z - bound_loc.z)));
+                        if (new_root == BelId())
+                            goto fail;
+                        displaced_clusters.emplace(bound->cluster, new_root);
+                    } else {
+                        // Single cell swap
+                        if (used_bels.count(old_bel))
+                            goto fail;
+                        used_bels.insert(old_bel);
+                        if (!add_to_move(bound, bound->bel, old_bel))
+                            goto fail;
+                    }
+                } else if (!ctx->checkBelAvail(db.second)) {
+                    goto fail;
+                }
+            }
+        }
+        compute_total_change();
+        // SA acceptance criteria
+
+        if (!accept_move()) {
+            // SA fail
+            goto fail;
+        }
+        // Check validity rules
+        if (!bind_move())
+            goto fail;
+        if (!check_validity())
+            goto fail;
+        // Accepted!
+        commit_move();
+        reset_move_state();
+        return true;
+    fail:
+        revert_move();
+        reset_move_state();
+        return false;
+    }
+
+    BelId random_bel_for_cell(CellInfo *cell, int force_z = -1)
+    {
+        IdString targetType = cell->type;
+        Loc curr_loc = ctx->getBelLocation(cell->bel);
+        int count = 0;
+
+        int dx = g.radius, dy = g.radius;
+        if (cell->region != nullptr && cell->region->constr_bels) {
+            dx = std::min(g.cfg.hpwl_scale_x * g.radius,
+                          (g.region_bounds[cell->region->name].x1 - g.region_bounds[cell->region->name].x0) + 1);
+            dy = std::min(g.cfg.hpwl_scale_y * g.radius,
+                          (g.region_bounds[cell->region->name].y1 - g.region_bounds[cell->region->name].y0) + 1);
+            // Clamp location to within bounds
+            curr_loc.x = std::max(g.region_bounds[cell->region->name].x0, curr_loc.x);
+            curr_loc.x = std::min(g.region_bounds[cell->region->name].x1, curr_loc.x);
+            curr_loc.y = std::max(g.region_bounds[cell->region->name].y0, curr_loc.y);
+            curr_loc.y = std::min(g.region_bounds[cell->region->name].y1, curr_loc.y);
+        }
+
+        FastBels::FastBelsData *bel_data;
+        auto type_cnt = g.bels.getBelsForCellType(targetType, &bel_data);
+
+        while (true) {
+            int nx = rng.rng(2 * dx + 1) + std::max(curr_loc.x - dx, 0);
+            int ny = rng.rng(2 * dy + 1) + std::max(curr_loc.y - dy, 0);
+            if (type_cnt < 64)
+                nx = ny = 0;
+            if (nx >= int(bel_data->size()))
+                continue;
+            if (ny >= int(bel_data->at(nx).size()))
+                continue;
+            const auto &fb = bel_data->at(nx).at(ny);
+            if (fb.size() == 0)
+                continue;
+            BelId bel = fb.at(rng.rng(int(fb.size())));
+            if (!bounds_check(bel))
+                continue;
+            if (force_z != -1) {
+                Loc loc = ctx->getBelLocation(bel);
+                if (loc.z != force_z)
+                    continue;
+            }
+            if (!cell->testRegion(bel))
+                continue;
+            count++;
+            return bel;
+        }
+    }
+
+    void run_iter()
+    {
+        setup_initial_state();
+        n_accept = 0;
+        n_move = 0;
+        for (int m = 0; m < g.cfg.inner_iters; m++) {
+            for (auto cell : p.cells) {
+                if (cell->belStrength > STRENGTH_STRONG)
+                    continue;
+                if (cell->cluster != ClusterId()) {
+                    if (cell != ctx->getClusterRootCell(cell->cluster))
+                        continue; // only move cluster root
+                    Loc old_loc = ctx->getBelLocation(cell->bel);
+                    BelId new_root = random_bel_for_cell(cell, old_loc.z);
+                    if (new_root == BelId() || new_root == cell->bel)
+                        continue;
+                    ++n_move;
+                    if (chain_swap(cell, new_root))
+                        ++n_accept;
+                } else {
+                    BelId new_bel = random_bel_for_cell(cell);
+                    if (new_bel == BelId() || new_bel == cell->bel)
+                        continue;
+                    ++n_move;
+                    if (single_cell_swap(cell, new_bel))
+                        ++n_accept;
+                }
+            }
+        }
+    }
+};
+
+struct ParallelRefine
+{
+    Context *ctx;
+    GlobalState g;
+    std::vector<ThreadState> t;
+    ParallelRefine(Context *ctx, ParallelRefineCfg cfg) : ctx(ctx), g(ctx, cfg)
+    {
+        g.flat_nets.reserve(ctx->nets.size());
+        for (auto &net : ctx->nets) {
+            net.second->udata = g.flat_nets.size();
+            g.flat_nets.push_back(net.second.get());
+        }
+        // Set up per-thread context
+        for (int i = 0; i < cfg.threads; i++) {
+            t.emplace_back(ctx, g, i);
+        }
+        // Set up region bounds
+        for (auto &region : ctx->region) {
+            Region *r = region.second.get();
+            NetBB bb;
+            if (r->constr_bels) {
+                bb.x0 = std::numeric_limits<int>::max();
+                bb.x1 = std::numeric_limits<int>::min();
+                bb.y0 = std::numeric_limits<int>::max();
+                bb.y1 = std::numeric_limits<int>::min();
+                for (auto bel : r->bels) {
+                    Loc loc = ctx->getBelLocation(bel);
+                    bb.x0 = std::min(bb.x0, loc.x);
+                    bb.x1 = std::max(bb.x1, loc.x);
+                    bb.y0 = std::min(bb.y0, loc.y);
+                    bb.y1 = std::max(bb.y1, loc.y);
+                }
+            } else {
+                bb.x0 = 0;
+                bb.y0 = 0;
+                bb.x1 = ctx->getGridDimX();
+                bb.y1 = ctx->getGridDimY();
+            }
+            g.region_bounds[r->name] = bb;
+        }
+        // Set up fast bels map
+        pool<IdString> cell_types_in_use;
+        for (auto &cell : ctx->cells) {
+            IdString cell_type = cell.second->type;
+            cell_types_in_use.insert(cell_type);
+        }
+
+        for (auto cell_type : cell_types_in_use) {
+            g.bels.addCellType(cell_type);
+        }
+    };
+    std::vector<Partition> parts;
+    void do_partition()
+    {
+        parts.clear();
+        parts.emplace_back(ctx);
+        bool yaxis = false;
+        while (parts.size() < t.size()) {
+            std::vector<Partition> next(parts.size() * 2);
+            for (size_t i = 0; i < parts.size(); i++) {
+                // Randomly perturb the pivot every iteration so we get different thread boundaries
+                const float delta = 0.1;
+                float pivot = (0.5 - (delta / 2)) + (delta / 2) * (ctx->rng(10000) / 10000.0f);
+                parts.at(i).split(ctx, yaxis, pivot, next.at(i * 2), next.at(i * 2 + 1));
+            }
+            std::swap(parts, next);
+            yaxis = !yaxis;
+        }
+
+        NPNR_ASSERT(parts.size() == t.size());
+        // TODO: thread pool to make this worthwhile...
+        std::vector<std::thread> workers;
+        for (size_t i = 0; i < t.size(); i++) {
+            workers.emplace_back([i, this]() { t.at(i).set_partition(parts.at(i)); });
+        }
+        for (auto &w : workers)
+            w.join();
+    }
+
+    void update_global_costs()
+    {
+        g.last_bounds.resize(g.flat_nets.size());
+        g.last_tmg_costs.resize(g.flat_nets.size());
+        g.total_wirelen = 0;
+        g.total_timing_cost = 0;
+        for (size_t i = 0; i < g.flat_nets.size(); i++) {
+            NetInfo *ni = g.flat_nets.at(i);
+            if (g.skip_net(ni))
+                continue;
+            g.last_bounds.at(i) = NetBB::compute(ctx, ni);
+            g.total_wirelen += g.last_bounds.at(i).hpwl(g.cfg);
+            if (!g.timing_skip_net(ni)) {
+                auto &tc = g.last_tmg_costs.at(i);
+                tc.resize(ni->users.capacity());
+                for (auto usr : ni->users.enumerate()) {
+                    tc.at(usr.index.idx()) = g.get_timing_cost(ni, usr.index);
+                    g.total_timing_cost += tc.at(usr.index.idx());
+                }
+            }
+        }
+    }
+    void run()
+    {
+
+        ScopeLock<Context> lock(ctx);
+        auto refine_start = std::chrono::high_resolution_clock::now();
+
+        g.tmg.setup_only = true;
+        g.tmg.setup();
+        do_partition();
+        log_info("Running parallel refinement with %d threads.\n", int(t.size()));
+        int iter = 1;
+        bool done = false;
+        update_global_costs();
+        double avg_wirelen = g.total_wirelen;
+        wirelen_t min_wirelen = g.total_wirelen;
+        while (true) {
+            if (iter > 1) {
+                if (g.total_wirelen >= min_wirelen) {
+                    done = true;
+                } else if (g.total_wirelen < min_wirelen) {
+                    min_wirelen = g.total_wirelen;
+                }
+                int n_accept = 0, n_move = 0;
+                for (auto &t_data : t) {
+                    n_accept += t_data.n_accept;
+                    n_move += t_data.n_move;
+                }
+                double r_accept = n_accept / double(n_move);
+                if (g.total_wirelen < (0.95 * avg_wirelen) && g.total_wirelen > 0) {
+                    avg_wirelen = 0.8 * avg_wirelen + 0.2 * g.total_wirelen;
+                } else {
+                    if (r_accept > 0.15 && g.radius > 1) {
+                        g.temperature *= 0.95;
+                    } else {
+                        g.temperature *= 0.8;
+                    }
+                }
+                if ((iter % 10) == 0 && g.radius > 1)
+                    --g.radius;
+            }
+
+            if ((iter == 1) || ((iter % 5) == 0) || done)
+                log_info("  at iteration #%d: temp = %f, timing cost = "
+                         "%.0f, wirelen = %.0f\n",
+                         iter, g.temperature, double(g.total_timing_cost), double(g.total_wirelen));
+
+            if (done)
+                break;
+
+            do_partition();
+
+            std::vector<std::thread> workers;
+            workers.reserve(t.size());
+            for (int j = 0; j < int(t.size()); j++)
+                workers.emplace_back([this, j]() { t.at(j).run_iter(); });
+            for (auto &w : workers)
+                w.join();
+            g.tmg.run();
+            update_global_costs();
+            iter++;
+            ctx->yield();
+        }
+        auto refine_end = std::chrono::high_resolution_clock::now();
+        log_info("Placement refine time %.02fs\n", std::chrono::duration<float>(refine_end - refine_start).count());
+    }
+};
+} // namespace
+
+ParallelRefineCfg::ParallelRefineCfg(Context *ctx)
+{
+    timing_driven = ctx->setting<bool>("timing_driven");
+    threads = ctx->setting<int>("threads", 8);
+    // snap to nearest power of two, respecting the minimum per-thread size
+    int actual_threads = 1;
+    while ((actual_threads * 2) <= threads && (int(ctx->cells.size()) / (actual_threads * 2)) >= min_thread_size)
+        actual_threads *= 2;
+    threads = actual_threads;
+    hpwl_scale_x = 1;
+    hpwl_scale_y = 1;
+}
+
+bool parallel_refine(Context *ctx, ParallelRefineCfg cfg)
+{
+    // TODO
+    ParallelRefine refine(ctx, cfg);
+    refine.run();
+    timing_analysis(ctx);
+    return true;
+}
+
+NEXTPNR_NAMESPACE_END
diff --git a/common/parallel_refine.h b/common/parallel_refine.h
new file mode 100644
index 00000000..556317cd
--- /dev/null
+++ b/common/parallel_refine.h
@@ -0,0 +1,43 @@
+/*
+ * nextpnr -- Next Generation Place and Route
+ *
+ * Copyright (C) 2021 gatecat <gatecat@ds0.me>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef PARALLEL_REFINE_H
+#define PARALLEL_REFINE_H
+
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+struct ParallelRefineCfg
+{
+    ParallelRefineCfg(Context *ctx);
+    bool timing_driven;
+    int threads;
+    int hpwl_scale_x, hpwl_scale_y;
+    double lambda = 0.5f;
+    float crit_exp = 8;
+    int inner_iters = 15;
+    int min_thread_size = 500;
+};
+
+bool parallel_refine(Context *ctx, ParallelRefineCfg cfg);
+
+NEXTPNR_NAMESPACE_END
+
+#endif
diff --git a/common/placer_heap.cc b/common/placer_heap.cc
index 5b43dc72..5ffb7642 100644
--- a/common/placer_heap.cc
+++ b/common/placer_heap.cc
@@ -46,6 +46,7 @@
 #include "fast_bels.h"
 #include "log.h"
 #include "nextpnr.h"
+#include "parallel_refine.h"
 #include "place_common.h"
 #include "placer1.h"
 #include "scope_lock.h"
@@ -346,8 +347,14 @@ class HeAPPlacer
         ctx->check();
         lock.unlock_early();
 
-        if (!placer1_refine(ctx, Placer1Cfg(ctx))) {
-            return false;
+        if (cfg.parallelRefine) {
+            if (!parallel_refine(ctx, ParallelRefineCfg(ctx))) {
+                return false;
+            }
+        } else {
+            if (!placer1_refine(ctx, Placer1Cfg(ctx))) {
+                return false;
+            }
         }
 
         return true;
@@ -1786,6 +1793,8 @@ PlacerHeapCfg::PlacerHeapCfg(Context *ctx)
     beta = ctx->setting<float>("placerHeap/beta");
     criticalityExponent = ctx->setting<int>("placerHeap/criticalityExponent");
     timingWeight = ctx->setting<int>("placerHeap/timingWeight");
+    parallelRefine = ctx->setting<bool>("placerHeap/parallelRefine", false);
+
     timing_driven = ctx->setting<bool>("timing_driven");
     solverTolerance = 1e-5;
     placeAllAtOnce = false;
diff --git a/common/placer_heap.h b/common/placer_heap.h
index bb40a126..9c62869e 100644
--- a/common/placer_heap.h
+++ b/common/placer_heap.h
@@ -41,6 +41,7 @@ struct PlacerHeapCfg
     bool timing_driven;
     float solverTolerance;
     bool placeAllAtOnce;
+    bool parallelRefine;
     int hpwl_scale_x, hpwl_scale_y;
     int spread_scale_x, spread_scale_y;
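
A note on thread count selection in `ParallelRefineCfg` above: the requested `threads` value (from `--threads`, defaulting to 8) is snapped down to a power of two, doubling only while every partition would keep at least `min_thread_size` (500) cells. As a worked example with hypothetical numbers, a 3000-cell design requesting 8 threads ends up with 4: going from 1 to 2 threads leaves 1500 cells per thread and from 2 to 4 leaves 750, both at least 500, but 8 threads would leave only 375, so the loop stops at 4.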