Diffstat (limited to 'backends/cxxrtl/cxxrtl.h')
-rw-r--r-- | backends/cxxrtl/cxxrtl.h | 243 |
1 file changed, 191 insertions, 52 deletions
diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h
index f0d7b9fc7..b4ffa87cd 100644
--- a/backends/cxxrtl/cxxrtl.h
+++ b/backends/cxxrtl/cxxrtl.h
@@ -36,22 +36,48 @@
 #include <map>
 #include <algorithm>
 #include <memory>
+#include <functional>
 #include <sstream>
 
 #include <backends/cxxrtl/cxxrtl_capi.h>
 
+#ifndef __has_attribute
+#	define __has_attribute(x) 0
+#endif
+
 // CXXRTL essentially uses the C++ compiler as a hygienic macro engine that feeds an instruction selector.
 // It generates a lot of specialized template functions with relatively large bodies that, when inlined
 // into the caller and (for those with loops) unrolled, often expose many new optimization opportunities.
 // Because of this, most of the CXXRTL runtime must be always inlined for best performance.
-#ifndef __has_attribute
-#	define __has_attribute(x) 0
-#endif
 #if __has_attribute(always_inline)
 #define CXXRTL_ALWAYS_INLINE inline __attribute__((__always_inline__))
 #else
 #define CXXRTL_ALWAYS_INLINE inline
 #endif
 
+// Conversely, some functions in the generated code are extremely large yet very cold, with both of these
+// properties being extreme enough to confuse C++ compilers into spending pathological amounts of time
+// on a futile (the code becomes worse) attempt to optimize the least important parts of code.
+#if __has_attribute(optnone)
+#define CXXRTL_EXTREMELY_COLD __attribute__((__optnone__))
+#elif __has_attribute(optimize)
+#define CXXRTL_EXTREMELY_COLD __attribute__((__optimize__(0)))
+#else
+#define CXXRTL_EXTREMELY_COLD
+#endif
+
+// CXXRTL uses assert() to check for C++ contract violations (which may result in e.g. undefined behavior
+// of the simulation code itself), and CXXRTL_ASSERT to check for RTL contract violations (which may at
+// most result in undefined simulation results).
+//
+// Though by default, CXXRTL_ASSERT() expands to assert(), it may be overridden e.g. when integrating
+// the simulation into another process that should survive violating RTL contracts.
+#ifndef CXXRTL_ASSERT
+#ifndef CXXRTL_NDEBUG
+#define CXXRTL_ASSERT(x) assert(x)
+#else
+#define CXXRTL_ASSERT(x)
+#endif
+#endif
 
 namespace cxxrtl {
 
@@ -96,9 +122,11 @@ struct value : public expr_base<value<Bits>> {
 	explicit constexpr value(Init ...init) : data{init...} {}
 
 	value(const value<Bits> &) = default;
-	value(value<Bits> &&) = default;
 	value<Bits> &operator=(const value<Bits> &) = default;
 
+	value(value<Bits> &&) = default;
+	value<Bits> &operator=(value<Bits> &&) = default;
+
 	// A (no-op) helper that forces the cast to value<>.
 	CXXRTL_ALWAYS_INLINE
 	const value<Bits> &val() const {
@@ -289,6 +317,14 @@ struct value : public expr_base<value<Bits>> {
 		return sext_cast<NewBits>()(*this);
 	}
 
+	// Bit replication is far more efficient than the equivalent concatenation.
+	template<size_t Count>
+	CXXRTL_ALWAYS_INLINE
+	value<Bits * Count> repeat() const {
+		static_assert(Bits == 1, "repeat() is implemented only for 1-bit values");
+		return *this ? value<Bits * Count>().bit_not() : value<Bits * Count>();
+	}
+
 	// Operations with run-time parameters (offsets, amounts, etc).
 	//
 	// These operations are used for computations.
@@ -421,6 +457,42 @@ struct value : public expr_base<value<Bits>> {
 		return shr<AmountBits, /*Signed=*/true>(amount);
 	}
 
+	template<size_t ResultBits, size_t SelBits>
+	value<ResultBits> bmux(const value<SelBits> &sel) const {
+		static_assert(ResultBits << SelBits == Bits, "invalid sizes used in bmux()");
+		size_t amount = sel.data[0] * ResultBits;
+		size_t shift_chunks = amount / chunk::bits;
+		size_t shift_bits = amount % chunk::bits;
+		value<ResultBits> result;
+		chunk::type carry = 0;
+		if (ResultBits % chunk::bits + shift_bits > chunk::bits)
+			carry = data[result.chunks + shift_chunks] << (chunk::bits - shift_bits);
+		for (size_t n = 0; n < result.chunks; n++) {
+			result.data[result.chunks - 1 - n] = carry | (data[result.chunks + shift_chunks - 1 - n] >> shift_bits);
+			carry = (shift_bits == 0) ? 0
+				: data[result.chunks + shift_chunks - 1 - n] << (chunk::bits - shift_bits);
+		}
+		return result;
+	}
+
+	template<size_t ResultBits, size_t SelBits>
+	value<ResultBits> demux(const value<SelBits> &sel) const {
+		static_assert(Bits << SelBits == ResultBits, "invalid sizes used in demux()");
+		size_t amount = sel.data[0] * Bits;
+		size_t shift_chunks = amount / chunk::bits;
+		size_t shift_bits = amount % chunk::bits;
+		value<ResultBits> result;
+		chunk::type carry = 0;
+		for (size_t n = 0; n < chunks; n++) {
+			result.data[shift_chunks + n] = (data[n] << shift_bits) | carry;
+			carry = (shift_bits == 0) ? 0
+				: data[n] >> (chunk::bits - shift_bits);
+		}
+		if (Bits % chunk::bits + shift_bits > chunk::bits)
+			result.data[shift_chunks + chunks] = carry;
+		return result;
+	}
+
 	size_t ctpop() const {
 		size_t count = 0;
 		for (size_t n = 0; n < chunks; n++) {
@@ -452,10 +524,11 @@ struct value : public expr_base<value<Bits>> {
 		bool carry = CarryIn;
 		for (size_t n = 0; n < result.chunks; n++) {
 			result.data[n] = data[n] + (Invert ? ~other.data[n] : other.data[n]) + carry;
+			if (result.chunks - 1 == n)
+				result.data[result.chunks - 1] &= result.msb_mask;
 			carry = (result.data[n] < data[n]) ||
 			        (result.data[n] == data[n] && carry);
 		}
-		result.data[result.chunks - 1] &= result.msb_mask;
 		return {result, carry};
 	}
@@ -642,14 +715,20 @@ struct wire {
 	value<Bits> next;
 
 	wire() = default;
-	constexpr wire(const value<Bits> &init) : curr(init), next(init) {}
+	explicit constexpr wire(const value<Bits> &init) : curr(init), next(init) {}
 	template<typename... Init>
 	explicit constexpr wire(Init ...init) : curr{init...}, next{init...} {}
 
+	// Copying and copy-assigning values is natural. If, however, a value is replaced with a wire,
+	// e.g. because a module is built with a different optimization level, then existing code could
+	// unintentionally copy a wire instead, which would create a subtle but serious bug. To make sure
+	// this doesn't happen, prohibit copying and copy-assigning wires.
 	wire(const wire<Bits> &) = delete;
-	wire(wire<Bits> &&) = default;
 	wire<Bits> &operator=(const wire<Bits> &) = delete;
 
+	wire(wire<Bits> &&) = default;
+	wire<Bits> &operator=(wire<Bits> &&) = default;
+
 	template<class IntegerT>
 	CXXRTL_ALWAYS_INLINE
 	IntegerT get() const {
@@ -679,47 +758,32 @@ std::ostream &operator<<(std::ostream &os, const wire<Bits> &val) {
 
 template<size_t Width>
 struct memory {
-	std::vector<value<Width>> data;
-
-	size_t depth() const {
-		return data.size();
-	}
+	const size_t depth;
+	std::unique_ptr<value<Width>[]> data;
 
-	memory() = delete;
-	explicit memory(size_t depth) : data(depth) {}
+	explicit memory(size_t depth) : depth(depth), data(new value<Width>[depth]) {}
 
 	memory(const memory<Width> &) = delete;
 	memory<Width> &operator=(const memory<Width> &) = delete;
 
-	// The only way to get the compiler to put the initializer in .rodata and do not copy it on stack is to stuff it
-	// into a plain array. You'd think an std::initializer_list would work here, but it doesn't, because you can't
-	// construct an initializer_list in a constexpr (or something) and so if you try to do that the whole thing is
-	// first copied on the stack (probably overflowing it) and then again into `data`.
-	template<size_t Size>
-	struct init {
-		size_t offset;
-		value<Width> data[Size];
-	};
-
-	template<size_t... InitSize>
-	explicit memory(size_t depth, const init<InitSize> &...init) : data(depth) {
-		data.resize(depth);
-		// This utterly reprehensible construct is the most reasonable way to apply a function to every element
-		// of a parameter pack, if the elements all have different types and so cannot be cast to an initializer list.
-		auto _ = {std::move(std::begin(init.data), std::end(init.data), data.begin() + init.offset)...};
-		(void)_;
+	memory(memory<Width> &&) = default;
+	memory<Width> &operator=(memory<Width> &&other) {
+		assert(depth == other.depth);
+		data = std::move(other.data);
+		write_queue = std::move(other.write_queue);
+		return *this;
 	}
 
 	// An operator for direct memory reads. May be used at any time during the simulation.
 	const value<Width> &operator [](size_t index) const {
-		assert(index < data.size());
+		assert(index < depth);
 		return data[index];
 	}
 
 	// An operator for direct memory writes. May only be used before the simulation is started. If used
 	// after the simulation is started, the design may malfunction.
 	value<Width> &operator [](size_t index) {
-		assert(index < data.size());
+		assert(index < depth);
 		return data[index];
 	}
@@ -744,7 +808,7 @@ struct memory {
 	std::vector<write> write_queue;
 
 	void update(size_t index, const value<Width> &val, const value<Width> &mask, int priority = 0) {
-		assert(index < data.size());
+		assert(index < depth);
 		// Queue up the write while keeping the queue sorted by priority.
 		write_queue.insert(
 			std::upper_bound(write_queue.begin(), write_queue.end(), priority,
@@ -814,35 +878,52 @@ struct metadata {
 
 typedef std::map<std::string, metadata> metadata_map;
 
-// Helper class to disambiguate values/wires and their aliases.
+// Tag class to disambiguate values/wires and their aliases.
 struct debug_alias {};
 
+// Tag declaration to disambiguate values and debug outlines.
+using debug_outline = ::_cxxrtl_outline;
+
 // This structure is intended for consumption via foreign function interfaces, like Python's ctypes.
 // Because of this it uses a C-style layout that is easy to parse rather than more idiomatic C++.
 //
 // To avoid violating strict aliasing rules, this structure has to be a subclass of the one used
 // in the C API, or it would not be possible to cast between the pointers to these.
 struct debug_item : ::cxxrtl_object {
+	// Object types.
 	enum : uint32_t {
-		VALUE  = CXXRTL_VALUE,
-		WIRE   = CXXRTL_WIRE,
-		MEMORY = CXXRTL_MEMORY,
-		ALIAS  = CXXRTL_ALIAS,
+		VALUE   = CXXRTL_VALUE,
+		WIRE    = CXXRTL_WIRE,
+		MEMORY  = CXXRTL_MEMORY,
+		ALIAS   = CXXRTL_ALIAS,
+		OUTLINE = CXXRTL_OUTLINE,
+	};
+
+	// Object flags.
+	enum : uint32_t {
+		INPUT  = CXXRTL_INPUT,
+		OUTPUT = CXXRTL_OUTPUT,
+		INOUT  = CXXRTL_INOUT,
+		DRIVEN_SYNC = CXXRTL_DRIVEN_SYNC,
+		DRIVEN_COMB = CXXRTL_DRIVEN_COMB,
+		UNDRIVEN    = CXXRTL_UNDRIVEN,
 	};
 
 	debug_item(const ::cxxrtl_object &object) : cxxrtl_object(object) {}
 
 	template<size_t Bits>
-	debug_item(value<Bits> &item, size_t lsb_offset = 0) {
+	debug_item(value<Bits> &item, size_t lsb_offset = 0, uint32_t flags_ = 0) {
 		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
 		              "value<Bits> is not compatible with C layout");
 		type = VALUE;
+		flags = flags_;
 		width = Bits;
 		lsb_at = lsb_offset;
 		depth = 1;
 		zero_at = 0;
 		curr = item.data;
 		next = item.data;
+		outline = nullptr;
 	}
 
 	template<size_t Bits>
@@ -850,26 +931,30 @@ struct debug_item : ::cxxrtl_object {
 		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
 		              "value<Bits> is not compatible with C layout");
 		type = VALUE;
+		flags = DRIVEN_COMB;
 		width = Bits;
 		lsb_at = lsb_offset;
 		depth = 1;
 		zero_at = 0;
 		curr = const_cast<chunk_t*>(item.data);
 		next = nullptr;
+		outline = nullptr;
 	}
 
 	template<size_t Bits>
-	debug_item(wire<Bits> &item, size_t lsb_offset = 0) {
+	debug_item(wire<Bits> &item, size_t lsb_offset = 0, uint32_t flags_ = 0) {
 		static_assert(sizeof(item.curr) == value<Bits>::chunks * sizeof(chunk_t) &&
 		              sizeof(item.next) == value<Bits>::chunks * sizeof(chunk_t),
 		              "wire<Bits> is not compatible with C layout");
 		type = WIRE;
+		flags = flags_;
 		width = Bits;
 		lsb_at = lsb_offset;
 		depth = 1;
 		zero_at = 0;
 		curr = item.curr.data;
 		next = item.next.data;
+		outline = nullptr;
 	}
 
 	template<size_t Width>
@@ -877,12 +962,14 @@ struct debug_item : ::cxxrtl_object {
 		static_assert(sizeof(item.data[0]) == value<Width>::chunks * sizeof(chunk_t),
 		              "memory<Width> is not compatible with C layout");
 		type = MEMORY;
+		flags = 0;
 		width = Width;
 		lsb_at = 0;
-		depth = item.data.size();
+		depth = item.depth;
 		zero_at = zero_offset;
-		curr = item.data.empty() ? nullptr : item.data[0].data;
+		curr = item.data ? item.data[0].data : nullptr;
 		next = nullptr;
+		outline = nullptr;
 	}
 
 	template<size_t Bits>
@@ -890,12 +977,14 @@ struct debug_item : ::cxxrtl_object {
 		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
 		              "value<Bits> is not compatible with C layout");
 		type = ALIAS;
+		flags = DRIVEN_COMB;
 		width = Bits;
 		lsb_at = lsb_offset;
 		depth = 1;
 		zero_at = 0;
 		curr = const_cast<chunk_t*>(item.data);
 		next = nullptr;
+		outline = nullptr;
 	}
 
 	template<size_t Bits>
@@ -904,12 +993,45 @@ struct debug_item : ::cxxrtl_object {
 		              sizeof(item.next) == value<Bits>::chunks * sizeof(chunk_t),
 		              "wire<Bits> is not compatible with C layout");
 		type = ALIAS;
+		flags = DRIVEN_COMB;
 		width = Bits;
 		lsb_at = lsb_offset;
 		depth = 1;
 		zero_at = 0;
 		curr = const_cast<chunk_t*>(item.curr.data);
 		next = nullptr;
+		outline = nullptr;
+	}
+
+	template<size_t Bits>
+	debug_item(debug_outline &group, const value<Bits> &item, size_t lsb_offset = 0) {
+		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
+		              "value<Bits> is not compatible with C layout");
+		type = OUTLINE;
+		flags = DRIVEN_COMB;
+		width = Bits;
+		lsb_at = lsb_offset;
+		depth = 1;
+		zero_at = 0;
+		curr = const_cast<chunk_t*>(item.data);
+		next = nullptr;
+		outline = &group;
+	}
+
+	template<size_t Bits, class IntegerT>
+	IntegerT get() const {
+		assert(width == Bits && depth == 1);
+		value<Bits> item;
+		std::copy(curr, curr + value<Bits>::chunks, item.data);
+		return item.template get<IntegerT>();
+	}
+
+	template<size_t Bits, class IntegerT>
+	void set(IntegerT other) const {
+		assert(width == Bits && depth == 1);
+		value<Bits> item;
+		item.template set<IntegerT>(other);
+		std::copy(item.data, item.data + value<Bits>::chunks, next);
 	}
 };
 static_assert(std::is_standard_layout<debug_item>::value, "debug_item is not compatible with C layout");
@@ -947,13 +1069,25 @@ struct debug_items {
 	}
 };
 
+// Tag class to disambiguate the default constructor used by the toplevel module that calls reset(),
+// and the constructor of interior modules that should not call it.
+struct interior {};
+
 struct module {
 	module() {}
 	virtual ~module() {}
 
+	// Modules with black boxes cannot be copied. Although not all designs include black boxes,
+	// delete the copy constructor and copy assignment operator to make sure that any downstream
+	// code that manipulates modules doesn't accidentally depend on their availability.
 	module(const module &) = delete;
 	module &operator=(const module &) = delete;
 
+	module(module &&) = default;
+	module &operator=(module &&) = default;
+
+	virtual void reset() = 0;
+
 	virtual bool eval() = 0;
 	virtual bool commit() = 0;
@@ -974,11 +1108,16 @@ struct module {
 
 } // namespace cxxrtl
 
-// Internal structure used to communicate with the implementation of the C interface.
+// Internal structures used to communicate with the implementation of the C interface.
+
 typedef struct _cxxrtl_toplevel {
 	std::unique_ptr<cxxrtl::module> module;
 } *cxxrtl_toplevel;
 
+typedef struct _cxxrtl_outline {
+	std::function<void()> eval;
+} *cxxrtl_outline;
+
 // Definitions of internal Yosys cells. Other than the functions in this namespace, CXXRTL is fully generic
 // and independent of Yosys implementation details.
 //
@@ -1112,49 +1251,49 @@ value<BitsY> xnor_ss(const value<BitsA> &a, const value<BitsB> &b) {
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> shl_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template zcast<BitsY>().template shl(b);
+	return a.template zcast<BitsY>().shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> shl_su(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template scast<BitsY>().template shl(b);
+	return a.template scast<BitsY>().shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> sshl_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template zcast<BitsY>().template shl(b);
+	return a.template zcast<BitsY>().shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> sshl_su(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template scast<BitsY>().template shl(b);
+	return a.template scast<BitsY>().shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> shr_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template shr(b).template zcast<BitsY>();
+	return a.shr(b).template zcast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> shr_su(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template shr(b).template scast<BitsY>();
+	return a.shr(b).template scast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> sshr_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template shr(b).template zcast<BitsY>();
+	return a.shr(b).template zcast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> sshr_su(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template sshr(b).template scast<BitsY>();
+	return a.sshr(b).template scast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
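The hunks above introduce several user-visible interfaces; the sketches that follow illustrate how they might be used. None of this code is part of the change itself, and any identifiers not shown in the diff are made up for illustration.

The new CXXRTL_ASSERT hook is only given its default expansion when the macro is still unset, so an embedder that wants to survive RTL contract violations can define it before including cxxrtl.h. A minimal sketch, assuming the reporting strategy is up to the host:

    // Sketch only: report RTL contract violations instead of aborting the host process.
    // CXXRTL_ASSERT must be defined before cxxrtl.h is first included, because the header
    // only provides the assert()-based default when the macro is not yet defined.
    #include <cstdio>
    #define CXXRTL_ASSERT(x) \
        do { \
            if (!(x)) \
                std::fprintf(stderr, "RTL contract violated: %s\n", #x); \
        } while (0)
    #include <backends/cxxrtl/cxxrtl.h>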
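The chunked implementations of the new bmux() and demux() operations are hard to read in isolation. In terms of a single machine word their semantics reduce to the reference model below (assuming the packed value fits in 64 bits and the slice width is less than 64; this is not the implementation above, only its intent):

    #include <cstdint>

    // bmux: select the sel-th result_bits-wide slice of a packed word (a "big mux").
    uint64_t bmux_ref(uint64_t packed, unsigned sel, unsigned result_bits) {
        return (packed >> (sel * result_bits)) & ((uint64_t(1) << result_bits) - 1);
    }

    // demux: place a bits-wide word into the sel-th slot of an otherwise zero result (a "big demux").
    uint64_t demux_ref(uint64_t word, unsigned sel, unsigned bits) {
        return word << (sel * bits);
    }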
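The memory<> rework replaces the std::vector backing store and the depth() accessor with a fixed-size array and a const depth member, and makes memories movable but not copyable. Direct indexing is unchanged; a sketch of pre-simulation initialization, assuming cxxrtl.h is already included:

    // Sketch only: fill a 256-entry, 8-bit wide memory before the simulation starts.
    cxxrtl::memory<8> rom(256);
    for (size_t addr = 0; addr < rom.depth; addr++)       // depth is now a data member, not a method
        rom[addr].set<uint8_t>(static_cast<uint8_t>(addr));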
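debug_item gains driver/port flags, an OUTLINE type, and typed get()/set() accessors that copy whole values through the curr and next pointers of the C-layout object. A sketch of driving a value through its debug item; the 8-bit value and the DRIVEN_SYNC flag here are illustrative:

    // Sketch only: expose an 8-bit value as a debug item and poke it from host code.
    cxxrtl::value<8> counter;
    cxxrtl::debug_item item(counter, /*lsb_offset=*/0, cxxrtl::debug_item::DRIVEN_SYNC);
    uint8_t now = item.get<8, uint8_t>();   // copies the bits referenced by curr
    item.set<8, uint8_t>(now + 1);          // stores new bits through next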
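Finally, module grows a pure virtual reset() alongside eval() and commit(), and becomes movable. A typical top-level driving sequence would now start with reset(); cxxrtl_design::p_top below is a stand-in for whatever class `write_cxxrtl` generates:

    // Sketch only: reset, then settle the design by alternating eval() and commit().
    cxxrtl_design::p_top top;
    top.reset();                  // apply initial values to registers and memories
    do {
        top.eval();               // recompute combinational logic
    } while (top.commit());      // latch state; repeat until nothing changes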