8 files changed, 1453 insertions, 232 deletions
diff --git a/backends/cxxrtl/Makefile.inc b/backends/cxxrtl/Makefile.inc
index f93e65f85..aaa304502 100644
--- a/backends/cxxrtl/Makefile.inc
+++ b/backends/cxxrtl/Makefile.inc
@@ -1,2 +1,2 @@
 
-OBJS += backends/cxxrtl/cxxrtl.o
+OBJS += backends/cxxrtl/cxxrtl_backend.o
diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h
index 701510b7f..f0d7b9fc7 100644
--- a/backends/cxxrtl/cxxrtl.h
+++ b/backends/cxxrtl/cxxrtl.h
@@ -17,6 +17,11 @@
  */
 
 // This file is included by the designs generated with `write_cxxrtl`. It is not used in Yosys itself.
+//
+// The CXXRTL support library implements compile-time specialized arbitrary-width arithmetic, and provides
+// composite lvalues made out of bit slices and concatenations of lvalues. This allows the `write_cxxrtl` pass
+// to perform a straightforward translation of RTLIL structures to readable C++, relying on the C++ compiler
+// to unwrap the abstraction and generate efficient code.
 
 #ifndef CXXRTL_H
 #define CXXRTL_H
@@ -33,13 +38,24 @@
 #include <memory>
 #include <sstream>
 
-// The cxxrtl support library implements compile time specialized arbitrary width arithmetics, as well as provides
-// composite lvalues made out of bit slices and concatenations of lvalues. This allows the `write_cxxrtl` pass
-// to perform a straightforward translation of RTLIL structures to readable C++, relying on the C++ compiler
-// to unwrap the abstraction and generate efficient code.
+#include <backends/cxxrtl/cxxrtl_capi.h>
+
+// CXXRTL essentially uses the C++ compiler as a hygienic macro engine that feeds an instruction selector.
+// It generates a lot of specialized template functions with relatively large bodies that, when inlined
+// into the caller and (for those with loops) unrolled, often expose many new optimization opportunities.
+// Because of this, most of the CXXRTL runtime must be always inlined for best performance.
+#ifndef __has_attribute
+#	define __has_attribute(x) 0
+#endif
+#if __has_attribute(always_inline)
+#define CXXRTL_ALWAYS_INLINE inline __attribute__((__always_inline__))
+#else
+#define CXXRTL_ALWAYS_INLINE inline
+#endif
+
 namespace cxxrtl {
 
-// All arbitrary-width values in cxxrtl are backed by arrays of unsigned integers called chunks. The chunk size
+// All arbitrary-width values in CXXRTL are backed by arrays of unsigned integers called chunks. The chunk size
 // is the same regardless of the value width to simplify manipulating values via FFI interfaces, e.g. driving
 // and introspecting the simulation in Python.
 //
@@ -49,6 +65,9 @@ namespace cxxrtl {
 // invisible to the compiler, (b) we often operate on non-power-of-2 values and have to clear the high bits anyway.
 // Therefore, using relatively wide chunks and clearing the high bits explicitly and only when we know they may be
 // clobbered results in simpler generated code.
+typedef uint32_t chunk_t;
+typedef uint64_t wide_chunk_t;
+
 template<typename T>
 struct chunk_traits {
 	static_assert(std::is_integral<T>::value && std::is_unsigned<T>::value,
@@ -65,7 +84,7 @@ template<size_t Bits>
 struct value : public expr_base<value<Bits>> {
 	static constexpr size_t bits = Bits;
 
-	using chunk = chunk_traits<uint32_t>;
+	using chunk = chunk_traits<chunk_t>;
 	static constexpr chunk::type msb_mask = (Bits % chunk::bits == 0) ? chunk::mask
 		: chunk::mask >> (chunk::bits - (Bits % chunk::bits));
 
@@ -81,6 +100,7 @@ struct value : public expr_base<value<Bits>> {
 	value<Bits> &operator=(const value<Bits> &) = default;
 
 	// A (no-op) helper that forces the cast to value<>.
+	CXXRTL_ALWAYS_INLINE
 	const value<Bits> &val() const {
 		return *this;
 	}
@@ -91,12 +111,42 @@ struct value : public expr_base<value<Bits>> {
 		return ss.str();
 	}
 
+	// Conversion operations.
+	//
+	// These functions ensure that a conversion is never out of range, and should be always used, if at all
+	// possible, instead of direct manipulation of the `data` member. For very large types, .slice() and
+	// .concat() can be used to split them into more manageable parts.
+	template<class IntegerT>
+	CXXRTL_ALWAYS_INLINE
+	IntegerT get() const { // Returns the whole value packed into a single unsigned integer of type IntegerT.
+		static_assert(std::numeric_limits<IntegerT>::is_integer && !std::numeric_limits<IntegerT>::is_signed,
+		              "get<T>() requires T to be an unsigned integral type");
+		static_assert(std::numeric_limits<IntegerT>::digits >= Bits,
+		              "get<T>() requires T to be at least as wide as the value is");
+		IntegerT result = 0;
+		for (size_t n = 0; n < chunks; n++)
+			result |= IntegerT(data[n]) << (n * chunk::bits); // chunk n lands at bit offset n * chunk::bits
+		return result;
+	}
+
+	template<class IntegerT>
+	CXXRTL_ALWAYS_INLINE
+	void set(IntegerT other) { // Overwrites every chunk of `data` with the corresponding bits of `other`.
+		static_assert(std::numeric_limits<IntegerT>::is_integer && !std::numeric_limits<IntegerT>::is_signed,
+		              "set<T>() requires T to be an unsigned integral type");
+		static_assert(std::numeric_limits<IntegerT>::digits >= Bits, // also keeps every shift below narrower than T
+		              "set<T>() requires T to be at least as wide as the value is");
+		for (size_t n = 0; n < chunks; n++) // NOTE(review): bits of `other` above Bits are not cleared with msb_mask here
+			data[n] = (other >> (n * chunk::bits)) & chunk::mask;
+	}
+
 	// Operations with compile-time parameters.
 	//
 	// These operations are used to implement slicing, concatenation, and blitting.
 	// The trunc, zext and sext operations add or remove most significant bits (i.e. on the left);
 	// the rtrunc and rzext operations add or remove least significant bits (i.e. on the right).
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> trunc() const {
 		static_assert(NewBits <= Bits, "trunc() may not increase width");
 		value<NewBits> result;
@@ -107,6 +157,7 @@ struct value : public expr_base<value<Bits>> {
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> zext() const {
 		static_assert(NewBits >= Bits, "zext() may not decrease width");
 		value<NewBits> result;
@@ -116,6 +167,7 @@ struct value : public expr_base<value<Bits>> {
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> sext() const {
 		static_assert(NewBits >= Bits, "sext() may not decrease width");
 		value<NewBits> result;
@@ -131,6 +183,7 @@ struct value : public expr_base<value<Bits>> {
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> rtrunc() const {
 		static_assert(NewBits <= Bits, "rtrunc() may not increase width");
 		value<NewBits> result;
@@ -150,6 +203,7 @@ struct value : public expr_base<value<Bits>> {
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> rzext() const {
 		static_assert(NewBits >= Bits, "rzext() may not decrease width");
 		value<NewBits> result;
@@ -161,13 +215,14 @@ struct value : public expr_base<value<Bits>> {
 			carry = (shift_bits == 0) ? 0
 				: data[n] >> (chunk::bits - shift_bits);
 		}
-		if (carry != 0)
-			result.data[result.chunks - 1] = carry;
+		if (shift_chunks + chunks < result.chunks)
+			result.data[shift_chunks + chunks] = carry;
 		return result;
 	}
 
 	// Bit blit operation, i.e. a partial read-modify-write.
 	template<size_t Stop, size_t Start>
+	CXXRTL_ALWAYS_INLINE
 	value<Bits> blit(const value<Stop - Start + 1> &source) const {
 		static_assert(Stop >= Start, "blit() may not reverse bit order");
 		constexpr chunk::type start_mask = ~(chunk::mask << (Start % chunk::bits));
@@ -192,6 +247,7 @@ struct value : public expr_base<value<Bits>> {
 	// than the operand. In C++17 these can be replaced with `if constexpr`.
 	template<size_t NewBits, typename = void>
 	struct zext_cast {
+		CXXRTL_ALWAYS_INLINE
 		value<NewBits> operator()(const value<Bits> &val) {
 			return val.template zext<NewBits>();
 		}
@@ -199,6 +255,7 @@ struct value : public expr_base<value<Bits>> {
 
 	template<size_t NewBits>
 	struct zext_cast<NewBits, typename std::enable_if<(NewBits < Bits)>::type> {
+		CXXRTL_ALWAYS_INLINE
 		value<NewBits> operator()(const value<Bits> &val) {
 			return val.template trunc<NewBits>();
 		}
@@ -206,6 +263,7 @@ struct value : public expr_base<value<Bits>> {
 
 	template<size_t NewBits, typename = void>
 	struct sext_cast {
+		CXXRTL_ALWAYS_INLINE
 		value<NewBits> operator()(const value<Bits> &val) {
 			return val.template sext<NewBits>();
 		}
@@ -213,17 +271,20 @@ struct value : public expr_base<value<Bits>> {
 
 	template<size_t NewBits>
 	struct sext_cast<NewBits, typename std::enable_if<(NewBits < Bits)>::type> {
+		CXXRTL_ALWAYS_INLINE
 		value<NewBits> operator()(const value<Bits> &val) {
 			return val.template trunc<NewBits>();
 		}
 	};
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> zcast() const {
 		return zext_cast<NewBits>()(*this);
 	}
 
 	template<size_t NewBits>
+	CXXRTL_ALWAYS_INLINE
 	value<NewBits> scast() const {
 		return sext_cast<NewBits>()(*this);
 	}
@@ -242,6 +303,10 @@ struct value : public expr_base<value<Bits>> {
 		data[offset_chunks] |= value ? 1 << offset_bits : 0;
 	}
 
+	explicit operator bool() const {
+		return !is_zero();
+	}
+
 	bool is_zero() const {
 		for (size_t n = 0; n < chunks; n++)
 			if (data[n] != 0)
@@ -249,10 +314,6 @@ struct value : public expr_base<value<Bits>> {
 		return true;
 	}
 
-	explicit operator bool() const {
-		return !is_zero();
-	}
-
 	bool is_neg() const {
 		return data[chunks - 1] & (1 << ((Bits - 1) % chunk::bits));
 	}
@@ -345,10 +406,12 @@ struct value : public expr_base<value<Bits>> {
 				: data[chunks - 1 - n] << (chunk::bits - shift_bits);
 		}
 		if (Signed && is_neg()) {
-			for (size_t n = chunks - shift_chunks; n < chunks; n++)
+			size_t top_chunk_idx  = (Bits - shift_bits) / chunk::bits;
+			size_t top_chunk_bits = (Bits - shift_bits) % chunk::bits;
+			for (size_t n = top_chunk_idx + 1; n < chunks; n++)
 				result.data[n] = chunk::mask;
 			if (shift_bits != 0)
-				result.data[chunks - shift_chunks] |= chunk::mask << (chunk::bits - shift_bits);
+				result.data[top_chunk_idx] |= chunk::mask << top_chunk_bits;
 		}
 		return result;
 	}
@@ -421,6 +484,24 @@ struct value : public expr_base<value<Bits>> {
 		bool overflow = (is_neg() == !other.is_neg()) && (is_neg() != result.is_neg());
 		return result.is_neg() ^ overflow; // a.scmp(b) ≡ a s< b
 	}
+
+	template<size_t ResultBits>
+	value<ResultBits> mul(const value<Bits> &other) const { // Chunk-by-chunk product of *this and other, truncated to ResultBits.
+		value<ResultBits> result;
+		wide_chunk_t wide_result[result.chunks + 1] = {}; // zero-filled; the extra slot receives the carry out of the last kept chunk
+		for (size_t n = 0; n < chunks; n++) {
+			for (size_t m = 0; m < chunks && n + m < result.chunks; m++) { // partial products at or above result.chunks are skipped
+				wide_result[n + m] += wide_chunk_t(data[n]) * wide_chunk_t(other.data[m]);
+				wide_result[n + m + 1] += wide_result[n + m] >> chunk::bits; // push the bits above one chunk into the next slot
+				wide_result[n + m] &= chunk::mask; // keep only the low chunk::bits bits in this slot
+			}
+		}
+		for (size_t n = 0; n < result.chunks; n++) {
+			result.data[n] = wide_result[n]; // narrowing store of the wide slot into a chunk
+		}
+		result.data[result.chunks - 1] &= result.msb_mask; // clear the bits above ResultBits in the top chunk
+		return result;
+	}
 };
 
 // Expression template for a slice, usable as lvalue or rvalue, and composable with other expression templates here.
@@ -435,12 +516,14 @@ struct slice_expr : public expr_base<slice_expr<T, Stop, Start>> {
 	slice_expr(T &expr) : expr(expr) {}
 	slice_expr(const slice_expr<T, Stop, Start> &) = delete;
 
+	CXXRTL_ALWAYS_INLINE
 	operator value<bits>() const {
 		return static_cast<const value<T::bits> &>(expr)
 			.template rtrunc<T::bits - Start>()
 			.template trunc<bits>();
 	}
 
+	CXXRTL_ALWAYS_INLINE
 	slice_expr<T, Stop, Start> &operator=(const value<bits> &rhs) {
 		// Generic partial assignment implemented using a read-modify-write operation on the sliced expression.
 		expr = static_cast<const value<T::bits> &>(expr)
@@ -449,6 +532,7 @@ struct slice_expr : public expr_base<slice_expr<T, Stop, Start>> {
 	}
 
 	// A helper that forces the cast to value<>, which allows deduction to work.
+	CXXRTL_ALWAYS_INLINE
 	value<bits> val() const {
 		return static_cast<const value<bits> &>(*this);
 	}
@@ -465,6 +549,7 @@ struct concat_expr : public expr_base<concat_expr<T, U>> {
 	concat_expr(T &ms_expr, U &ls_expr) : ms_expr(ms_expr), ls_expr(ls_expr) {}
 	concat_expr(const concat_expr<T, U> &) = delete;
 
+	CXXRTL_ALWAYS_INLINE
 	operator value<bits>() const {
 		value<bits> ms_shifted = static_cast<const value<T::bits> &>(ms_expr)
 			.template rzext<bits>();
@@ -473,6 +558,7 @@ struct concat_expr : public expr_base<concat_expr<T, U>> {
 		return ms_shifted.bit_or(ls_extended);
 	}
 
+	CXXRTL_ALWAYS_INLINE
 	concat_expr<T, U> &operator=(const value<bits> &rhs) {
 		ms_expr = rhs.template rtrunc<T::bits>();
 		ls_expr = rhs.template trunc<U::bits>();
@@ -480,6 +566,7 @@ struct concat_expr : public expr_base<concat_expr<T, U>> {
 	}
 
 	// A helper that forces the cast to value<>, which allows deduction to work.
+	CXXRTL_ALWAYS_INLINE
 	value<bits> val() const {
 		return static_cast<const value<bits> &>(*this);
 	}
@@ -504,21 +591,25 @@ struct concat_expr : public expr_base<concat_expr<T, U>> {
 template<class T>
 struct expr_base {
 	template<size_t Stop, size_t Start = Stop>
+	CXXRTL_ALWAYS_INLINE
 	slice_expr<const T, Stop, Start> slice() const {
 		return {*static_cast<const T *>(this)};
 	}
 
 	template<size_t Stop, size_t Start = Stop>
+	CXXRTL_ALWAYS_INLINE
 	slice_expr<T, Stop, Start> slice() {
 		return {*static_cast<T *>(this)};
 	}
 
 	template<class U>
+	CXXRTL_ALWAYS_INLINE
 	concat_expr<const T, typename std::remove_reference<const U>::type> concat(const U &other) const {
 		return {*static_cast<const T *>(this), other};
 	}
 
 	template<class U>
+	CXXRTL_ALWAYS_INLINE
 	concat_expr<T, typename std::remove_reference<U>::type> concat(U &&other) {
 		return {*static_cast<T *>(this), other};
 	}
@@ -559,6 +650,18 @@ struct wire {
 	wire(wire<Bits> &&) = default;
 	wire<Bits> &operator=(const wire<Bits> &) = delete;
 
+	template<class IntegerT>
+	CXXRTL_ALWAYS_INLINE
+	IntegerT get() const { // Reads `curr` as an integer; see value<Bits>::get for the type requirements.
+		return curr.template get<IntegerT>();
+	}
+
+	template<class IntegerT>
+	CXXRTL_ALWAYS_INLINE
+	void set(IntegerT other) { // Writes `next` only; `curr` is left unchanged by this call.
+		next.template set<IntegerT>(other);
+	}
+
 	bool commit() {
 		if (curr != next) {
 			curr = next;
@@ -604,6 +707,7 @@ struct memory {
 		// This utterly reprehensible construct is the most reasonable way to apply a function to every element
 		// of a parameter pack, if the elements all have different types and so cannot be cast to an initializer list.
 		auto _ = {std::move(std::begin(init.data), std::end(init.data), data.begin() + init.offset)...};
+		(void)_;
 	}
 
 	// An operator for direct memory reads. May be used at any time during the simulation.
@@ -672,10 +776,8 @@ struct metadata {
 
 	// In debug mode, using the wrong .as_*() function will assert.
 	// In release mode, using the wrong .as_*() function will safely return a default value.
-	union {
-		const unsigned  uint_value = 0;
-		const signed    sint_value;
-	};
+	const unsigned    uint_value = 0;
+	const signed      sint_value = 0;
 	const std::string string_value = "";
 	const double      double_value = 0.0;
 
@@ -712,6 +814,139 @@ struct metadata {
 
 typedef std::map<std::string, metadata> metadata_map;
 
+// Helper class to disambiguate values/wires and their aliases.
+struct debug_alias {};
+
+// This structure is intended for consumption via foreign function interfaces, like Python's ctypes.
+// Because of this it uses a C-style layout that is easy to parse rather than more idiomatic C++.
+//
+// To avoid violating strict aliasing rules, this structure has to be a subclass of the one used
+// in the C API, or it would not be possible to cast between the pointers to these.
+struct debug_item : ::cxxrtl_object {
+	enum : uint32_t { // C++-side names for the CXXRTL_* type constants
+		VALUE  = CXXRTL_VALUE,
+		WIRE   = CXXRTL_WIRE,
+		MEMORY = CXXRTL_MEMORY,
+		ALIAS  = CXXRTL_ALIAS,
+	};
+
+	debug_item(const ::cxxrtl_object &object) : cxxrtl_object(object) {} // copies an existing C API object
+
+	template<size_t Bits>
+	debug_item(value<Bits> &item, size_t lsb_offset = 0) { // describes a mutable value
+		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
+		              "value<Bits> is not compatible with C layout");
+		type    = VALUE;
+		width   = Bits;
+		lsb_at  = lsb_offset;
+		depth   = 1;
+		zero_at = 0;
+		curr    = item.data;
+		next    = item.data; // same storage as `curr`
+	}
+
+	template<size_t Bits>
+	debug_item(const value<Bits> &item, size_t lsb_offset = 0) { // describes a const value
+		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
+		              "value<Bits> is not compatible with C layout");
+		type    = VALUE;
+		width   = Bits;
+		lsb_at  = lsb_offset;
+		depth   = 1;
+		zero_at = 0;
+		curr    = const_cast<chunk_t*>(item.data); // presumably the C struct field is a non-const pointer -- confirm in cxxrtl_capi.h
+		next    = nullptr; // no `next` storage is exposed for a const value
+	}
+
+	template<size_t Bits>
+	debug_item(wire<Bits> &item, size_t lsb_offset = 0) { // describes a wire through both of its values
+		static_assert(sizeof(item.curr) == value<Bits>::chunks * sizeof(chunk_t) &&
+		              sizeof(item.next) == value<Bits>::chunks * sizeof(chunk_t),
+		              "wire<Bits> is not compatible with C layout");
+		type    = WIRE;
+		width   = Bits;
+		lsb_at  = lsb_offset;
+		depth   = 1;
+		zero_at = 0;
+		curr    = item.curr.data;
+		next    = item.next.data;
+	}
+
+	template<size_t Width>
+	debug_item(memory<Width> &item, size_t zero_offset = 0) { // describes a memory; takes a zero offset instead of an LSB offset
+		static_assert(sizeof(item.data[0]) == value<Width>::chunks * sizeof(chunk_t),
+		              "memory<Width> is not compatible with C layout");
+		type    = MEMORY;
+		width   = Width;
+		lsb_at  = 0;
+		depth   = item.data.size(); // number of elements held by the memory
+		zero_at = zero_offset;
+		curr    = item.data.empty() ? nullptr : item.data[0].data; // avoids indexing element 0 of an empty memory
+		next    = nullptr;
+	}
+
+	template<size_t Bits>
+	debug_item(debug_alias, const value<Bits> &item, size_t lsb_offset = 0) { // the debug_alias tag selects type ALIAS for a value
+		static_assert(sizeof(item) == value<Bits>::chunks * sizeof(chunk_t),
+		              "value<Bits> is not compatible with C layout");
+		type    = ALIAS;
+		width   = Bits;
+		lsb_at  = lsb_offset;
+		depth   = 1;
+		zero_at = 0;
+		curr    = const_cast<chunk_t*>(item.data);
+		next    = nullptr;
+	}
+
+	template<size_t Bits>
+	debug_item(debug_alias, const wire<Bits> &item, size_t lsb_offset = 0) { // the debug_alias tag selects type ALIAS for a wire
+		static_assert(sizeof(item.curr) == value<Bits>::chunks * sizeof(chunk_t) &&
+		              sizeof(item.next) == value<Bits>::chunks * sizeof(chunk_t),
+		              "wire<Bits> is not compatible with C layout");
+		type    = ALIAS;
+		width   = Bits;
+		lsb_at  = lsb_offset;
+		depth   = 1;
+		zero_at = 0;
+		curr    = const_cast<chunk_t*>(item.curr.data); // only the wire's `curr` value is exposed
+		next    = nullptr;
+	}
+};
+static_assert(std::is_standard_layout<debug_item>::value, "debug_item is not compatible with C layout");
+
+struct debug_items { // Collection of debug items, keyed by name.
+	std::map<std::string, std::vector<debug_item>> table; // each name maps to one or more parts
+
+	void add(const std::string &name, debug_item &&item) { // appends a part under `name`, creating the entry if needed
+		std::vector<debug_item> &parts = table[name];
+		parts.emplace_back(item);
+		std::sort(parts.begin(), parts.end(), // re-sorted on every add so parts stay ordered by ascending lsb_at
+			[](const debug_item &a, const debug_item &b) {
+				return a.lsb_at < b.lsb_at;
+			});
+	}
+
+	size_t count(const std::string &name) const { // number of parts stored under `name`; 0 if the name is absent
+		if (table.count(name) == 0)
+			return 0;
+		return table.at(name).size();
+	}
+
+	const std::vector<debug_item> &parts_at(const std::string &name) const { // std::map::at throws std::out_of_range if absent
+		return table.at(name);
+	}
+
+	const debug_item &at(const std::string &name) const { // for names that have exactly one part
+		const std::vector<debug_item> &parts = table.at(name);
+		assert(parts.size() == 1); // multi-part names must be read through parts_at()
+		return parts.at(0);
+	}
+
+	const debug_item &operator [](const std::string &name) const { // same as at(name)
+		return at(name);
+	}
+};
+
 struct module {
 	module() {}
 	virtual ~module() {}
@@ -731,11 +966,20 @@ struct module {
 		} while (commit() && !converged);
 		return deltas;
 	}
+
+	virtual void debug_info(debug_items &items, std::string path = "") { // default implementation adds nothing to `items`
+		(void)items, (void)path; // marks both parameters as intentionally unused
+	}
 };
 
 } // namespace cxxrtl
 
-// Definitions of internal Yosys cells. Other than the functions in this namespace, cxxrtl is fully generic
+// Internal structure used to communicate with the implementation of the C interface.
+typedef struct _cxxrtl_toplevel {
+	std::unique_ptr<cxxrtl::module> module;
+} *cxxrtl_toplevel;
+
+// Definitions of internal Yosys cells. Other than the functions in this namespace, CXXRTL is fully generic
 // and indepenent of Yosys implementation details.
 //
 // The `write_cxxrtl` pass translates internal cells (cells with names that start with `$`) to calls of these
@@ -749,309 +993,322 @@ using namespace cxxrtl;
 
 // std::max isn't constexpr until C++14 for no particular reason (it's an oversight), so we define our own.
 template<class T>
+CXXRTL_ALWAYS_INLINE
 constexpr T max(const T &a, const T &b) {
 	return a > b ? a : b;
 }
 
 // Logic operations
 template<size_t BitsY, size_t BitsA>
-value<BitsY> not_u(const value<BitsA> &a) {
-	return a.template zcast<BitsY>().bit_not();
-}
-
-template<size_t BitsY, size_t BitsA>
-value<BitsY> not_s(const value<BitsA> &a) {
-	return a.template scast<BitsY>().bit_not();
-}
-
-template<size_t BitsY, size_t BitsA>
-value<BitsY> logic_not_u(const value<BitsA> &a) {
+CXXRTL_ALWAYS_INLINE
+value<BitsY> logic_not(const value<BitsA> &a) {
 	return value<BitsY> { a ? 0u : 1u };
 }
 
-template<size_t BitsY, size_t BitsA>
-value<BitsY> logic_not_s(const value<BitsA> &a) {
-	return value<BitsY> { a ? 0u : 1u };
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
+value<BitsY> logic_and(const value<BitsA> &a, const value<BitsB> &b) {
+	return value<BitsY> { (bool(a) && bool(b)) ? 1u : 0u };
 }
 
-template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_and_u(const value<BitsA> &a) {
-	return value<BitsY> { a.bit_not().is_zero() ? 1u : 0u };
+template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
+value<BitsY> logic_or(const value<BitsA> &a, const value<BitsB> &b) {
+	return value<BitsY> { (bool(a) || bool(b)) ? 1u : 0u };
 }
 
+// Reduction operations
 template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_and_s(const value<BitsA> &a) {
+CXXRTL_ALWAYS_INLINE
+value<BitsY> reduce_and(const value<BitsA> &a) {
 	return value<BitsY> { a.bit_not().is_zero() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_or_u(const value<BitsA> &a) {
-	return value<BitsY> { a ? 1u : 0u };
-}
-
-template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_or_s(const value<BitsA> &a) {
+CXXRTL_ALWAYS_INLINE
+value<BitsY> reduce_or(const value<BitsA> &a) {
 	return value<BitsY> { a ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_xor_u(const value<BitsA> &a) {
-	return value<BitsY> { (a.ctpop() % 2) ? 1u : 0u };
-}
-
-template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_xor_s(const value<BitsA> &a) {
+CXXRTL_ALWAYS_INLINE
+value<BitsY> reduce_xor(const value<BitsA> &a) {
 	return value<BitsY> { (a.ctpop() % 2) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_xnor_u(const value<BitsA> &a) {
+CXXRTL_ALWAYS_INLINE
+value<BitsY> reduce_xnor(const value<BitsA> &a) {
 	return value<BitsY> { (a.ctpop() % 2) ? 0u : 1u };
 }
 
 template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_xnor_s(const value<BitsA> &a) {
-	return value<BitsY> { (a.ctpop() % 2) ? 0u : 1u };
+CXXRTL_ALWAYS_INLINE
+value<BitsY> reduce_bool(const value<BitsA> &a) {
+	return value<BitsY> { a ? 1u : 0u };
 }
 
+// Bitwise operations
 template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_bool_u(const value<BitsA> &a) {
-	return value<BitsY> { a ? 1u : 0u };
+CXXRTL_ALWAYS_INLINE
+value<BitsY> not_u(const value<BitsA> &a) {
+	return a.template zcast<BitsY>().bit_not();
 }
 
 template<size_t BitsY, size_t BitsA>
-value<BitsY> reduce_bool_s(const value<BitsA> &a) {
-	return value<BitsY> { a ? 1u : 0u };
+CXXRTL_ALWAYS_INLINE
+value<BitsY> not_s(const value<BitsA> &a) {
+	return a.template scast<BitsY>().bit_not();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> and_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().bit_and(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> and_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().bit_and(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> or_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().bit_or(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> or_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().bit_or(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> xor_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().bit_xor(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> xor_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().bit_xor(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> xnor_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().bit_xor(b.template zcast<BitsY>()).bit_not();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> xnor_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().bit_xor(b.template scast<BitsY>()).bit_not();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
-value<BitsY> logic_and_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	return value<BitsY> { (bool(a) & bool(b)) ? 1u : 0u };
-}
-
-template<size_t BitsY, size_t BitsA, size_t BitsB>
-value<BitsY> logic_and_ss(const value<BitsA> &a, const value<BitsB> &b) {
-	return value<BitsY> { (bool(a) & bool(b)) ? 1u : 0u };
-}
-
-template<size_t BitsY, size_t BitsA, size_t BitsB>
-value<BitsY> logic_or_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	return value<BitsY> { (bool(a) | bool(b)) ? 1u : 0u };
-}
-
-template<size_t BitsY, size_t BitsA, size_t BitsB>
-value<BitsY> logic_or_ss(const value<BitsA> &a, const value<BitsB> &b) {
-	return value<BitsY> { (bool(a) | bool(b)) ? 1u : 0u };
-}
-
-template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shl_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().template shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shl_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().template shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sshl_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().template shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sshl_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().template shl(b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shr_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template shr(b).template zcast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shr_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template shr(b).template scast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sshr_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template shr(b).template zcast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sshr_su(const value<BitsA> &a, const value<BitsB> &b) {
-	return a.template shr(b).template scast<BitsY>();
+	return a.template sshr(b).template scast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shift_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return shr_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shift_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return shr_su<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shift_us(const value<BitsA> &a, const value<BitsB> &b) {
 	return b.is_neg() ? shl_uu<BitsY>(a, b.template sext<BitsB + 1>().neg()) : shr_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shift_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return b.is_neg() ? shl_su<BitsY>(a, b.template sext<BitsB + 1>().neg()) : shr_su<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shiftx_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return shift_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shiftx_su(const value<BitsA> &a, const value<BitsB> &b) {
 	return shift_su<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shiftx_us(const value<BitsA> &a, const value<BitsB> &b) {
 	return shift_us<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> shiftx_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return shift_ss<BitsY>(a, b);
 }
 
 // Comparison operations
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> eq_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY>{ a.template zext<BitsExt>() == b.template zext<BitsExt>() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> eq_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY>{ a.template sext<BitsExt>() == b.template sext<BitsExt>() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> ne_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY>{ a.template zext<BitsExt>() != b.template zext<BitsExt>() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> ne_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY>{ a.template sext<BitsExt>() != b.template sext<BitsExt>() ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> eqx_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return eq_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> eqx_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return eq_ss<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> nex_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return ne_uu<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> nex_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return ne_ss<BitsY>(a, b);
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> gt_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { b.template zext<BitsExt>().ucmp(a.template zext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> gt_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { b.template sext<BitsExt>().scmp(a.template sext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> ge_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { !a.template zext<BitsExt>().ucmp(b.template zext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> ge_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { !a.template sext<BitsExt>().scmp(b.template sext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> lt_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { a.template zext<BitsExt>().ucmp(b.template zext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> lt_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { a.template sext<BitsExt>().scmp(b.template sext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> le_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { !b.template zext<BitsExt>().ucmp(a.template zext<BitsExt>()) ? 1u : 0u };
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> le_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t BitsExt = max(BitsA, BitsB);
 	return value<BitsY> { !b.template sext<BitsExt>().scmp(a.template sext<BitsExt>()) ? 1u : 0u };
@@ -1059,71 +1316,68 @@ value<BitsY> le_ss(const value<BitsA> &a, const value<BitsB> &b) {
 
 // Arithmetic operations
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> pos_u(const value<BitsA> &a) {
 	return a.template zcast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> pos_s(const value<BitsA> &a) {
 	return a.template scast<BitsY>();
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> neg_u(const value<BitsA> &a) {
 	return a.template zcast<BitsY>().neg();
 }
 
 template<size_t BitsY, size_t BitsA>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> neg_s(const value<BitsA> &a) {
 	return a.template scast<BitsY>().neg();
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> add_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().add(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> add_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().add(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sub_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template zcast<BitsY>().sub(b.template zcast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> sub_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return a.template scast<BitsY>().sub(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> mul_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	value<BitsY> product;
-	value<BitsY> multiplicand = a.template zcast<BitsY>();
-	const value<BitsB> &multiplier = b;
-	uint32_t multiplicand_shift = 0;
-	for (size_t step = 0; step < BitsB; step++) {
-		if (multiplier.bit(step)) {
-			multiplicand = multiplicand.shl(value<32> { multiplicand_shift });
-			product = product.add(multiplicand);
-			multiplicand_shift = 0;
-		}
-		multiplicand_shift++;
-	}
-	return product;
+	constexpr size_t BitsCommon = max(BitsA, BitsB); // both operands are cast to the wider input width
+	return a.template zcast<BitsCommon>().template mul<BitsY>(b.template zcast<BitsCommon>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> mul_ss(const value<BitsA> &a, const value<BitsB> &b) {
-	value<BitsB + 1> ub = b.template sext<BitsB + 1>();
-	if (ub.is_neg()) ub = ub.neg();
-	value<BitsY> y = mul_uu<BitsY>(a.template scast<BitsY>(), ub);
-	return b.is_neg() ? y.neg() : y;
+	return a.template scast<BitsY>().template mul<BitsY>(b.template scast<BitsY>());
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 std::pair<value<BitsY>, value<BitsY>> divmod_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	constexpr size_t Bits = max(BitsY, max(BitsA, BitsB));
 	value<Bits> quotient;
@@ -1145,6 +1399,7 @@ std::pair<value<BitsY>, value<BitsY>> divmod_uu(const value<BitsA> &a, const val
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 std::pair<value<BitsY>, value<BitsY>> divmod_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	value<BitsA + 1> ua = a.template sext<BitsA + 1>();
 	value<BitsB + 1> ub = b.template sext<BitsB + 1>();
@@ -1158,21 +1413,25 @@ std::pair<value<BitsY>, value<BitsY>> divmod_ss(const value<BitsA> &a, const val
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> div_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_uu<BitsY>(a, b).first;
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> div_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_ss<BitsY>(a, b).first;
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> mod_uu(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_uu<BitsY>(a, b).second;
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
+CXXRTL_ALWAYS_INLINE
 value<BitsY> mod_ss(const value<BitsA> &a, const value<BitsB> &b) {
 	return divmod_ss<BitsY>(a, b).second;
 }
diff --git a/backends/cxxrtl/cxxrtl.cc b/backends/cxxrtl/cxxrtl_backend.cc
index f3ed3f623..5e5ba5ac0 100644
--- a/backends/cxxrtl/cxxrtl.cc
+++ b/backends/cxxrtl/cxxrtl_backend.cc
@@ -171,11 +171,6 @@ struct Scheduler {
 	}
 };
 
-bool is_input_wire(const RTLIL::Wire *wire)
-{
-	return wire->port_input && !wire->port_output;
-}
-
 bool is_unary_cell(RTLIL::IdString type)
 {
 	return type.in(
@@ -192,22 +187,29 @@ bool is_binary_cell(RTLIL::IdString type)
 		ID($add), ID($sub), ID($mul), ID($div), ID($mod));
 }
 
+bool is_extending_cell(RTLIL::IdString type)
+{
+	// Logic and reduction cells are the exceptions; every other cell type counts as extending.
+	const bool is_logic_cell = type.in(ID($logic_not), ID($logic_and), ID($logic_or));
+	return !is_logic_cell && !type.in(ID($reduce_and), ID($reduce_or), ID($reduce_xor), ID($reduce_xnor), ID($reduce_bool));
+}
+
 bool is_elidable_cell(RTLIL::IdString type)
 {
 	return is_unary_cell(type) || is_binary_cell(type) || type.in(
-		ID($mux), ID($concat), ID($slice));
+		ID($mux), ID($concat), ID($slice), ID($pmux));
 }
 
 bool is_sync_ff_cell(RTLIL::IdString type)
 {
 	return type.in(
-		ID($dff), ID($dffe));
+		ID($dff), ID($dffe), ID($sdff), ID($sdffe), ID($sdffce));
 }
 
 bool is_ff_cell(RTLIL::IdString type)
 {
 	return is_sync_ff_cell(type) || type.in(
-		ID($adff), ID($dffsr), ID($dlatch), ID($dlatchsr), ID($sr));
+		ID($adff), ID($adffe), ID($dffsr), ID($dffsre), ID($dlatch), ID($adlatch), ID($dlatchsr), ID($sr));
 }
 
 bool is_internal_cell(RTLIL::IdString type)
@@ -359,10 +361,10 @@ struct FlowGraph {
 		//
 		// eliminating the unnecessary delta cycle. Conceptually, the CELL_SYNC node type is a series of
 		// connections of the form `connect \lhs \cell.\sync_output`; the right-hand side of these is not
-		// as a wire in RTLIL. If it was expressible, then `\cell.\sync_output` would have a sync def,
-		// and this node would be an ordinary CONNECT node, with `\lhs` having a comb def. Because it isn't,
-		// a special node type is used, the right-hand side does not appear anywhere, and the left-hand
-		// side has a comb def.
+		// expressible as a wire in RTLIL. If it was expressible, then `\cell.\sync_output` would have
+		// a sync def, and this node would be an ordinary CONNECT node, with `\lhs` having a comb def.
+		// Because it isn't, a special node type is used, the right-hand side does not appear anywhere,
+		// and the left-hand side has a comb def.
 		for (auto conn : cell->connections())
 			if (cell->output(conn.first))
 				if (is_cxxrtl_sync_port(cell, conn.first)) {
@@ -467,14 +469,16 @@ std::vector<std::string> split_by(const std::string &str, const std::string &sep
 	std::vector<std::string> result;
 	size_t prev = 0;
 	while (true) {
-		size_t curr = str.find_first_of(sep, prev + 1);
-		if (curr > str.size())
-			curr = str.size();
-		if (curr > prev + 1)
-			result.push_back(str.substr(prev, curr - prev));
-		if (curr == str.size())
+		size_t curr = str.find_first_of(sep, prev);
+		if (curr == std::string::npos) {
+			std::string part = str.substr(prev);
+			if (!part.empty()) result.push_back(part);
 			break;
-		prev = curr;
+		} else {
+			std::string part = str.substr(prev, curr - prev);
+			if (!part.empty()) result.push_back(part);
+			prev = curr + 1;
+		}
 	}
 	return result;
 }
@@ -502,6 +506,15 @@ std::string escape_cxx_string(const std::string &input)
 	return output;
 }
 
+template<class T>
+std::string get_hdl_name(T *object)
+{
+	// Use the `hdlname` attribute when the object carries one; otherwise fall back to the object's own
+	// name with its first character stripped.
+	const bool has_hdl_attr = object->has_attribute(ID::hdlname);
+	return has_hdl_attr ? object->get_string_attribute(ID::hdlname) : object->name.str().substr(1);
+}
+
 struct CxxrtlWorker {
 	bool split_intf = false;
 	std::string intf_filename;
@@ -509,13 +522,17 @@ struct CxxrtlWorker {
 	std::ostream *impl_f = nullptr;
 	std::ostream *intf_f = nullptr;
 
-	bool elide_internal = false;
-	bool elide_public = false;
+	bool run_flatten = false;
+	bool run_proc = false;
+
+	bool unbuffer_internal = false;
+	bool unbuffer_public = false;
 	bool localize_internal = false;
 	bool localize_public = false;
-	bool run_opt_clean_purge = false;
-	bool run_proc_flatten = false;
-	bool max_opt_level = false;
+	bool elide_internal = false;
+	bool elide_public = false;
+
+	bool debug_info = false;
 
 	std::ostringstream f;
 	std::string indent;
@@ -528,7 +545,10 @@ struct CxxrtlWorker {
 	dict<const RTLIL::Cell*, pool<const RTLIL::Cell*>> transparent_for;
 	dict<const RTLIL::Wire*, FlowGraph::Node> elided_wires;
 	dict<const RTLIL::Module*, std::vector<FlowGraph::Node>> schedule;
+	pool<const RTLIL::Wire*> unbuffered_wires;
 	pool<const RTLIL::Wire*> localized_wires;
+	dict<const RTLIL::Wire*, const RTLIL::Wire*> debug_alias_wires;
+	dict<const RTLIL::Wire*, RTLIL::Const> debug_const_wires;
 	dict<const RTLIL::Module*, pool<std::string>> blackbox_specializations;
 	dict<const RTLIL::Module*, bool> eval_converges;
 
@@ -765,7 +785,8 @@ struct CxxrtlWorker {
 			dump_const(chunk.data, chunk.width, chunk.offset);
 			return false;
 		} else {
-			if (!is_lhs && elided_wires.count(chunk.wire)) {
+			if (elided_wires.count(chunk.wire)) {
+				log_assert(!is_lhs);
 				const FlowGraph::Node &node = elided_wires[chunk.wire];
 				switch (node.type) {
 					case FlowGraph::Node::Type::CONNECT:
@@ -778,7 +799,7 @@ struct CxxrtlWorker {
 					default:
 						log_assert(false);
 				}
-			} else if (localized_wires[chunk.wire] || is_input_wire(chunk.wire)) {
+			} else if (unbuffered_wires[chunk.wire]) {
 				f << mangle(chunk.wire);
 			} else {
 				f << mangle(chunk.wire) << (is_lhs ? ".next" : ".curr");
@@ -895,17 +916,19 @@ struct CxxrtlWorker {
 	{
 		// Unary cells
 		if (is_unary_cell(cell->type)) {
-			f << cell->type.substr(1) << '_' <<
-			     (cell->getParam(ID::A_SIGNED).as_bool() ? 's' : 'u') <<
-			     "<" << cell->getParam(ID::Y_WIDTH).as_int() << ">(";
+			f << cell->type.substr(1);
+			if (is_extending_cell(cell->type))
+				f << '_' << (cell->getParam(ID::A_SIGNED).as_bool() ? 's' : 'u');
+			f << "<" << cell->getParam(ID::Y_WIDTH).as_int() << ">(";
 			dump_sigspec_rhs(cell->getPort(ID::A));
 			f << ")";
 		// Binary cells
 		} else if (is_binary_cell(cell->type)) {
-			f << cell->type.substr(1) << '_' <<
-			     (cell->getParam(ID::A_SIGNED).as_bool() ? 's' : 'u') <<
-			     (cell->getParam(ID::B_SIGNED).as_bool() ? 's' : 'u') <<
-			     "<" << cell->getParam(ID::Y_WIDTH).as_int() << ">(";
+			f << cell->type.substr(1);
+			if (is_extending_cell(cell->type))
+				f << '_' << (cell->getParam(ID::A_SIGNED).as_bool() ? 's' : 'u') <<
+				            (cell->getParam(ID::B_SIGNED).as_bool() ? 's' : 'u');
+			f << "<" << cell->getParam(ID::Y_WIDTH).as_int() << ">(";
 			dump_sigspec_rhs(cell->getPort(ID::A));
 			f << ", ";
 			dump_sigspec_rhs(cell->getPort(ID::B));
@@ -919,6 +942,21 @@ struct CxxrtlWorker {
 			f << " : ";
 			dump_sigspec_rhs(cell->getPort(ID::A));
 			f << ")";
+		// Parallel (one-hot) muxes
+		} else if (cell->type == ID($pmux)) {
+			int width = cell->getParam(ID::WIDTH).as_int();
+			int s_width = cell->getParam(ID::S_WIDTH).as_int();
+			for (int part = 0; part < s_width; part++) {
+				f << "(";
+				dump_sigspec_rhs(cell->getPort(ID::S).extract(part));
+				f << " ? ";
+				dump_sigspec_rhs(cell->getPort(ID::B).extract(part * width, width));
+				f << " : ";
+			}
+			dump_sigspec_rhs(cell->getPort(ID::A));
+			for (int part = 0; part < s_width; part++) {
+				f << ")";
+			}
 		// Concats
 		} else if (cell->type == ID($concat)) {
 			dump_sigspec_rhs(cell->getPort(ID::B));
@@ -985,35 +1023,6 @@ struct CxxrtlWorker {
 			f << " = ";
 			dump_cell_elided(cell);
 			f << ";\n";
-		// Parallel (one-hot) muxes
-		} else if (cell->type == ID($pmux)) {
-			int width = cell->getParam(ID::WIDTH).as_int();
-			int s_width = cell->getParam(ID::S_WIDTH).as_int();
-			bool first = true;
-			for (int part = 0; part < s_width; part++) {
-				f << (first ? indent : " else ");
-				first = false;
-				f << "if (";
-				dump_sigspec_rhs(cell->getPort(ID::S).extract(part));
-				f << ") {\n";
-				inc_indent();
-					f << indent;
-					dump_sigspec_lhs(cell->getPort(ID::Y));
-					f << " = ";
-					dump_sigspec_rhs(cell->getPort(ID::B).extract(part * width, width));
-					f << ";\n";
-				dec_indent();
-				f << indent << "}";
-			}
-			f << " else {\n";
-			inc_indent();
-				f << indent;
-				dump_sigspec_lhs(cell->getPort(ID::Y));
-				f << " = ";
-				dump_sigspec_rhs(cell->getPort(ID::A));
-				f << ";\n";
-			dec_indent();
-			f << indent << "}\n";
 		// Flip-flops
 		} else if (is_ff_cell(cell->type)) {
 			if (cell->hasPort(ID::CLK) && cell->getPort(ID::CLK).is_wire()) {
@@ -1023,7 +1032,7 @@ struct CxxrtlWorker {
 				f << indent << "if (" << (cell->getParam(ID::CLK_POLARITY).as_bool() ? "posedge_" : "negedge_")
 				            << mangle(clk_bit) << ") {\n";
 				inc_indent();
-					if (cell->type == ID($dffe)) {
+					if (cell->hasPort(ID::EN)) {
 						f << indent << "if (";
 						dump_sigspec_rhs(cell->getPort(ID::EN));
 						f << " == value<1> {" << cell->getParam(ID::EN_POLARITY).as_bool() << "u}) {\n";
@@ -1034,7 +1043,24 @@ struct CxxrtlWorker {
 					f << " = ";
 					dump_sigspec_rhs(cell->getPort(ID::D));
 					f << ";\n";
-					if (cell->type == ID($dffe)) {
+					if (cell->hasPort(ID::EN) && cell->type != ID($sdffce)) {
+						dec_indent();
+						f << indent << "}\n";
+					}
+					if (cell->hasPort(ID::SRST)) {
+						f << indent << "if (";
+						dump_sigspec_rhs(cell->getPort(ID::SRST));
+						f << " == value<1> {" << cell->getParam(ID::SRST_POLARITY).as_bool() << "u}) {\n";
+						inc_indent();
+							f << indent;
+							dump_sigspec_lhs(cell->getPort(ID::Q));
+							f << " = ";
+							dump_const(cell->getParam(ID::SRST_VALUE));
+							f << ";\n";
+						dec_indent();
+						f << indent << "}\n";
+					}
+					if (cell->hasPort(ID::EN) && cell->type == ID($sdffce)) {
 						dec_indent();
 						f << indent << "}\n";
 					}
@@ -1125,31 +1151,33 @@ struct CxxrtlWorker {
 				f << indent << "if(" << valid_index_temp << ".valid) {\n";
 				inc_indent();
 					if (writable_memories[memory]) {
-						std::string addr_temp = fresh_temporary();
-						f << indent << "const value<" << cell->getPort(ID::ADDR).size() << "> &" << addr_temp << " = ";
-						dump_sigspec_rhs(cell->getPort(ID::ADDR));
-						f << ";\n";
 						std::string lhs_temp = fresh_temporary();
 						f << indent << "value<" << memory->width << "> " << lhs_temp << " = "
 						            << mangle(memory) << "[" << valid_index_temp << ".index];\n";
 						std::vector<const RTLIL::Cell*> memwr_cells(transparent_for[cell].begin(), transparent_for[cell].end());
-						std::sort(memwr_cells.begin(), memwr_cells.end(),
-							[](const RTLIL::Cell *a, const RTLIL::Cell *b) {
-								return a->getParam(ID::PRIORITY).as_int() < b->getParam(ID::PRIORITY).as_int();
-							});
-						for (auto memwr_cell : memwr_cells) {
-							f << indent << "if (" << addr_temp << " == ";
-							dump_sigspec_rhs(memwr_cell->getPort(ID::ADDR));
-							f << ") {\n";
-							inc_indent();
-								f << indent << lhs_temp << " = " << lhs_temp;
-								f << ".update(";
-								dump_sigspec_rhs(memwr_cell->getPort(ID::DATA));
-								f << ", ";
-								dump_sigspec_rhs(memwr_cell->getPort(ID::EN));
-								f << ");\n";
-							dec_indent();
-							f << indent << "}\n";
+						if (!memwr_cells.empty()) {
+							std::string addr_temp = fresh_temporary();
+							f << indent << "const value<" << cell->getPort(ID::ADDR).size() << "> &" << addr_temp << " = ";
+							dump_sigspec_rhs(cell->getPort(ID::ADDR));
+							f << ";\n";
+							std::sort(memwr_cells.begin(), memwr_cells.end(),
+								[](const RTLIL::Cell *a, const RTLIL::Cell *b) {
+									return a->getParam(ID::PRIORITY).as_int() < b->getParam(ID::PRIORITY).as_int();
+								});
+							for (auto memwr_cell : memwr_cells) {
+								f << indent << "if (" << addr_temp << " == ";
+								dump_sigspec_rhs(memwr_cell->getPort(ID::ADDR));
+								f << ") {\n";
+								inc_indent();
+									f << indent << lhs_temp << " = " << lhs_temp;
+									f << ".update(";
+									dump_sigspec_rhs(memwr_cell->getPort(ID::DATA));
+									f << ", ";
+									dump_sigspec_rhs(memwr_cell->getPort(ID::EN));
+									f << ");\n";
+								dec_indent();
+								f << indent << "}\n";
+							}
 						}
 						f << indent;
 						dump_sigspec_lhs(cell->getPort(ID::DATA));
@@ -1411,13 +1439,12 @@ struct CxxrtlWorker {
 	{
 		if (elided_wires.count(wire))
 			return;
-		if (localized_wires.count(wire) != is_local_context)
-			return;
 
-		if (is_local_context) {
+		if (localized_wires[wire] && is_local_context) {
 			dump_attrs(wire);
 			f << indent << "value<" << wire->width << "> " << mangle(wire) << ";\n";
-		} else {
+		}
+		if (!localized_wires[wire] && !is_local_context) {
 			std::string width;
 			if (wire->module->has_attribute(ID(cxxrtl_blackbox)) && wire->has_attribute(ID(cxxrtl_width))) {
 				width = wire->get_string_attribute(ID(cxxrtl_width));
@@ -1426,14 +1453,21 @@ struct CxxrtlWorker {
 			}
 
 			dump_attrs(wire);
-			f << indent << (is_input_wire(wire) ? "value" : "wire") << "<" << width << "> " << mangle(wire);
+			f << indent;
+			if (wire->port_input && wire->port_output)
+				f << "/*inout*/ ";
+			else if (wire->port_input)
+				f << "/*input*/ ";
+			else if (wire->port_output)
+				f << "/*output*/ ";
+			f << (unbuffered_wires[wire] ? "value" : "wire") << "<" << width << "> " << mangle(wire);
 			if (wire->has_attribute(ID::init)) {
 				f << " ";
 				dump_const_init(wire->attributes.at(ID::init));
 			}
 			f << ";\n";
 			if (edge_wires[wire]) {
-				if (is_input_wire(wire)) {
+				if (unbuffered_wires[wire]) {
 					f << indent << "value<" << width << "> prev_" << mangle(wire);
 					if (wire->has_attribute(ID::init)) {
 						f << " ";
@@ -1444,7 +1478,7 @@ struct CxxrtlWorker {
 				for (auto edge_type : edge_types) {
 					if (edge_type.first.wire == wire) {
 						std::string prev, next;
-						if (is_input_wire(wire)) {
+						if (unbuffered_wires[wire]) {
 							prev = "prev_" + mangle(edge_type.first.wire);
 							next =           mangle(edge_type.first.wire);
 						} else {
@@ -1567,9 +1601,9 @@ struct CxxrtlWorker {
 		inc_indent();
 			f << indent << "bool changed = false;\n";
 			for (auto wire : module->wires()) {
-				if (elided_wires.count(wire) || localized_wires.count(wire))
+				if (elided_wires.count(wire))
 					continue;
-				if (is_input_wire(wire)) {
+				if (unbuffered_wires[wire]) {
 					if (edge_wires[wire])
 						f << indent << "prev_" << mangle(wire) << " = " << mangle(wire) << ";\n";
 					continue;
@@ -1594,6 +1628,72 @@ struct CxxrtlWorker {
 		dec_indent();
 	}
 
+	void dump_debug_info_method(RTLIL::Module *module)
+	{ // Writes the statements of the generated debug_info() body for `module` to `f`; callers emit the signature and braces.
+		size_t count_public_wires = 0; // the counters below only feed the log_debug() summary at the end
+		size_t count_const_wires = 0;
+		size_t count_alias_wires = 0;
+		size_t count_member_wires = 0;
+		size_t count_skipped_wires = 0;
+		inc_indent();
+			f << indent << "assert(path.empty() || path[path.size() - 1] == ' ');\n"; // generated check on the path prefix
+			for (auto wire : module->wires()) {
+				if (wire->name[0] != '\\') // only names starting with a backslash are counted as public wires
+					continue;
+				if (module->get_bool_attribute(ID(cxxrtl_blackbox)) && (wire->port_id == 0)) // in blackboxes, skip non-port wires
+					continue;
+				count_public_wires++;
+				if (debug_const_wires.count(wire)) {
+					// Wire tied to a constant: emit a function-local static holding the value and register that.
+					f << indent << "static const value<" << wire->width << "> const_" << mangle(wire) << " = ";
+					dump_const(debug_const_wires[wire]);
+					f << ";\n";
+					f << indent << "items.add(path + " << escape_cxx_string(get_hdl_name(wire));
+					f << ", debug_item(const_" << mangle(wire) << ", ";
+					f << wire->start_offset << "));\n";
+					count_const_wires++;
+				} else if (debug_alias_wires.count(wire)) {
+					// Alias of a member wire: register the aliased wire's mangled name under this wire's HDL name.
+					f << indent << "items.add(path + " << escape_cxx_string(get_hdl_name(wire));
+					f << ", debug_item(debug_alias(), " << mangle(debug_alias_wires[wire]) << ", ";
+					f << wire->start_offset << "));\n";
+					count_alias_wires++;
+				} else if (!localized_wires.count(wire)) {
+					// Member wire (not localized): register the wire's own mangled name.
+					f << indent << "items.add(path + " << escape_cxx_string(get_hdl_name(wire));
+					f << ", debug_item(" << mangle(wire) << ", ";
+					f << wire->start_offset << "));\n";
+					count_member_wires++;
+				} else { // localized wire that is neither a known constant nor a known alias: nothing is emitted
+					count_skipped_wires++;
+				}
+			}
+			if (!module->get_bool_attribute(ID(cxxrtl_blackbox))) { // memories and cells are only listed for non-blackbox modules
+				for (auto &memory_it : module->memories) {
+					if (memory_it.first[0] != '\\') // same backslash-prefix filter as for wires
+						continue;
+					f << indent << "items.add(path + " << escape_cxx_string(get_hdl_name(memory_it.second));
+					f << ", debug_item(" << mangle(memory_it.second) << ", ";
+					f << memory_it.second->start_offset << "));\n";
+				}
+				for (auto cell : module->cells()) {
+					if (is_internal_cell(cell->type)) // cells of internal types are skipped; the rest get a debug_info() call
+						continue;
+					const char *access = is_cxxrtl_blackbox_cell(cell) ? "->" : "."; // member access operator used on the cell
+					f << indent << mangle(cell) << access << "debug_info(items, ";
+					f << "path + " << escape_cxx_string(get_hdl_name(cell) + ' ') << ");\n"; // trailing space satisfies the emitted assert
+				}
+			}
+		dec_indent();
+		// Report how the wires of this module were classified; this is logged at debug level only.
+		log_debug("Debug information statistics for module `%s':\n", log_id(module));
+		log_debug("  Public wires: %zu, of which:\n", count_public_wires);
+		log_debug("    Const wires:  %zu\n", count_const_wires);
+		log_debug("    Alias wires:  %zu\n", count_alias_wires);
+		log_debug("    Member wires: %zu\n", count_member_wires);
+		log_debug("    Other wires:  %zu (no debug information)\n", count_skipped_wires);
+	}
+
 	void dump_metadata_map(const dict<RTLIL::IdString, RTLIL::Const> &metadata_map)
 	{
 		if (metadata_map.empty()) {
@@ -1642,6 +1742,12 @@ struct CxxrtlWorker {
 				dump_commit_method(module);
 				f << indent << "}\n";
 				f << "\n";
+				if (debug_info) {
+					f << indent << "void debug_info(debug_items &items, std::string path = \"\") override {\n";
+					dump_debug_info_method(module);
+					f << indent << "}\n";
+					f << "\n";
+				}
 				f << indent << "static std::unique_ptr<" << mangle(module);
 				f << template_params(module, /*is_decl=*/false) << "> ";
 				f << "create(std::string name, metadata_map parameters, metadata_map attributes);\n";
@@ -1690,7 +1796,7 @@ struct CxxrtlWorker {
 					if (cell_module->get_bool_attribute(ID(cxxrtl_blackbox))) {
 						f << indent << "std::unique_ptr<" << mangle(cell_module) << template_args(cell) << "> ";
 						f << mangle(cell) << " = " << mangle(cell_module) << template_args(cell);
-						f << "::create(" << escape_cxx_string(cell->name.str()) << ", ";
+						f << "::create(" << escape_cxx_string(get_hdl_name(cell)) << ", ";
 						dump_metadata_map(cell->parameters);
 						f << ", ";
 						dump_metadata_map(cell->attributes);
@@ -1704,6 +1810,8 @@ struct CxxrtlWorker {
 					f << "\n";
 				f << indent << "bool eval() override;\n";
 				f << indent << "bool commit() override;\n";
+				if (debug_info)
+					f << indent << "void debug_info(debug_items &items, std::string path = \"\") override;\n";
 			dec_indent();
 			f << indent << "}; // struct " << mangle(module) << "\n";
 			f << "\n";
@@ -1722,10 +1830,17 @@ struct CxxrtlWorker {
 		dump_commit_method(module);
 		f << indent << "}\n";
 		f << "\n";
+		if (debug_info) {
+			f << indent << "void " << mangle(module) << "::debug_info(debug_items &items, std::string path) {\n";
+			dump_debug_info_method(module);
+			f << indent << "}\n";
+			f << "\n";
+		}
 	}
 
 	void dump_design(RTLIL::Design *design)
 	{
+		RTLIL::Module *top_module = nullptr;
 		std::vector<RTLIL::Module*> modules;
 		TopoSort<RTLIL::Module*> topo_design;
 		for (auto module : design->modules()) {
@@ -1735,6 +1850,8 @@ struct CxxrtlWorker {
 				modules.push_back(module); // cxxrtl blackboxes first
 			if (module->get_blackbox_attribute() || module->get_bool_attribute(ID(cxxrtl_blackbox)))
 				continue;
+			if (module->get_bool_attribute(ID::top))
+				top_module = module;
 
 			topo_design.node(module);
 			for (auto cell : module->cells()) {
@@ -1745,7 +1862,8 @@ struct CxxrtlWorker {
 				topo_design.edge(cell_module, module);
 			}
 		}
-		log_assert(topo_design.sort());
+		bool no_loops = topo_design.sort();
+		log_assert(no_loops);
 		modules.insert(modules.end(), topo_design.sorted.begin(), topo_design.sorted.end());
 
 		if (split_intf) {
@@ -1756,6 +1874,25 @@ struct CxxrtlWorker {
 			f << "#ifndef " << include_guard << "\n";
 			f << "#define " << include_guard << "\n";
 			f << "\n";
+			if (top_module != nullptr && debug_info) {
+				f << "#include <backends/cxxrtl/cxxrtl_capi.h>\n";
+				f << "\n";
+				f << "#ifdef __cplusplus\n";
+				f << "extern \"C\" {\n";
+				f << "#endif\n";
+				f << "\n";
+				f << "cxxrtl_toplevel " << design_ns << "_create();\n";
+				f << "\n";
+				f << "#ifdef __cplusplus\n";
+				f << "}\n";
+				f << "#endif\n";
+				f << "\n";
+			} else {
+				f << "// The CXXRTL C API is not available because the design is built without debug information.\n";
+				f << "\n";
+			}
+			f << "#ifdef __cplusplus\n";
+			f << "\n";
 			f << "#include <backends/cxxrtl/cxxrtl.h>\n";
 			f << "\n";
 			f << "using namespace cxxrtl;\n";
@@ -1766,6 +1903,8 @@ struct CxxrtlWorker {
 				dump_module_intf(module);
 			f << "} // namespace " << design_ns << "\n";
 			f << "\n";
+			f << "#endif // __cplusplus\n";
+			f << "\n";
 			f << "#endif\n";
 			*intf_f << f.str(); f.str("");
 		}
@@ -1775,6 +1914,15 @@ struct CxxrtlWorker {
 		else
 			f << "#include <backends/cxxrtl/cxxrtl.h>\n";
 		f << "\n";
+		f << "#if defined(CXXRTL_INCLUDE_CAPI_IMPL) || \\\n";
+		f << "    defined(CXXRTL_INCLUDE_VCD_CAPI_IMPL)\n";
+		f << "#include <backends/cxxrtl/cxxrtl_capi.cc>\n";
+		f << "#endif\n";
+		f << "\n";
+		f << "#if defined(CXXRTL_INCLUDE_VCD_CAPI_IMPL)\n";
+		f << "#include <backends/cxxrtl/cxxrtl_vcd_capi.cc>\n";
+		f << "#endif\n";
+		f << "\n";
 		f << "using namespace cxxrtl_yosys;\n";
 		f << "\n";
 		f << "namespace " << design_ns << " {\n";
@@ -1785,6 +1933,18 @@ struct CxxrtlWorker {
 			dump_module_impl(module);
 		}
 		f << "} // namespace " << design_ns << "\n";
+		f << "\n";
+		if (top_module != nullptr && debug_info) {
+			f << "cxxrtl_toplevel " << design_ns << "_create() {\n";
+			inc_indent();
+				std::string top_type = design_ns + "::" + mangle(top_module);
+				f << indent << "return new _cxxrtl_toplevel { ";
+				f << "std::unique_ptr<" << top_type << ">(new " + top_type + ")";
+				f << " };\n";
+			dec_indent();
+			f << "}\n";
+		}
+
 		*impl_f << f.str(); f.str("");
 	}
 
@@ -1813,7 +1973,7 @@ struct CxxrtlWorker {
 	void analyze_design(RTLIL::Design *design)
 	{
 		bool has_feedback_arcs = false;
-		bool has_buffered_wires = false;
+		bool has_buffered_comb_wires = false;
 
 		for (auto module : design->modules()) {
 			if (!design->selected_module(module))
@@ -1825,6 +1985,8 @@ struct CxxrtlWorker {
 			if (module->get_bool_attribute(ID(cxxrtl_blackbox))) {
 				for (auto port : module->ports) {
 					RTLIL::Wire *wire = module->wire(port);
+					if (wire->port_input && !wire->port_output)
+						unbuffered_wires.insert(wire);
 					if (wire->has_attribute(ID(cxxrtl_edge))) {
 						RTLIL::Const edge_attr = wire->attributes[ID(cxxrtl_edge)];
 						if (!(edge_attr.flags & RTLIL::CONST_FLAG_STRING) || (int)edge_attr.decode_string().size() != GetSize(wire))
@@ -1880,7 +2042,7 @@ struct CxxrtlWorker {
 				FlowGraph::Node *node = flow.add_node(cell);
 
 				// Various DFF cells are treated like posedge/negedge processes, see above for details.
-				if (cell->type.in(ID($dff), ID($dffe), ID($adff), ID($dffsr))) {
+				if (cell->type.in(ID($dff), ID($dffe), ID($adff), ID($adffe), ID($dffsr), ID($dffsre), ID($sdff), ID($sdffe), ID($sdffce))) {
 					if (cell->getPort(ID::CLK).is_wire())
 						register_edge_signal(sigmap, cell->getPort(ID::CLK),
 							cell->parameters[ID::CLK_POLARITY].as_bool() ? RTLIL::STp : RTLIL::STn);
@@ -2013,12 +2175,16 @@ struct CxxrtlWorker {
 
 			for (auto wire : module->wires()) {
 				if (feedback_wires[wire]) continue;
-				if (wire->port_id != 0) continue;
+				if (wire->port_output && !module->get_bool_attribute(ID::top)) continue;
+				if (wire->name.begins_with("$") && !unbuffer_internal) continue;
+				if (wire->name.begins_with("\\") && !unbuffer_public) continue;
+				if (flow.wire_sync_defs.count(wire) > 0) continue;
+				unbuffered_wires.insert(wire);
+				if (edge_wires[wire]) continue;
 				if (wire->get_bool_attribute(ID::keep)) continue;
+				if (wire->port_input || wire->port_output) continue;
 				if (wire->name.begins_with("$") && !localize_internal) continue;
 				if (wire->name.begins_with("\\") && !localize_public) continue;
-				if (edge_wires[wire]) continue;
-				if (flow.wire_sync_defs.count(wire) > 0) continue;
 				localized_wires.insert(wire);
 			}
 
@@ -2028,35 +2194,72 @@ struct CxxrtlWorker {
 			// it is possible that a design with no feedback arcs would end up with doubly buffered wires in such cases
 			// as a wire with multiple drivers where one of them is combinatorial and the other is synchronous. Such designs
 			// also require more than one delta cycle to converge.
-			pool<const RTLIL::Wire*> buffered_wires;
+			pool<const RTLIL::Wire*> buffered_comb_wires;
 			for (auto wire : module->wires()) {
-				if (flow.wire_comb_defs[wire].size() > 0 && !elided_wires.count(wire) && !localized_wires[wire]) {
-					if (!feedback_wires[wire])
-						buffered_wires.insert(wire);
-				}
+				if (flow.wire_comb_defs[wire].size() > 0 && !unbuffered_wires[wire] && !feedback_wires[wire])
+					buffered_comb_wires.insert(wire);
 			}
-			if (!buffered_wires.empty()) {
-				has_buffered_wires = true;
+			if (!buffered_comb_wires.empty()) {
+				has_buffered_comb_wires = true;
 				log("Module `%s' contains buffered combinatorial wires:\n", log_id(module));
-				for (auto wire : buffered_wires)
+				for (auto wire : buffered_comb_wires)
 					log("  %s\n", log_id(wire));
 			}
 
-			eval_converges[module] = feedback_wires.empty() && buffered_wires.empty();
+			eval_converges[module] = feedback_wires.empty() && buffered_comb_wires.empty();
+
+			if (debug_info) {
+				// Find wires that alias other wires or are tied to a constant; debug information can be enriched with these
+				// at essentially zero additional cost.
+				//
+				// Note that the information collected here can't be used for optimizing the netlist: debug information queries
+				// are pure and run on a design in a stable state, which allows assumptions that do not otherwise hold.
+				for (auto wire : module->wires()) {
+					if (wire->name[0] != '\\')
+						continue;
+					if (!unbuffered_wires[wire])
+						continue;
+					const RTLIL::Wire *wire_it = wire;
+					while (1) {
+						if (!(flow.wire_def_elidable.count(wire_it) && flow.wire_def_elidable[wire_it]))
+							break; // not an alias: complex def
+						log_assert(flow.wire_comb_defs[wire_it].size() == 1);
+						FlowGraph::Node *node = *flow.wire_comb_defs[wire_it].begin();
+						if (node->type != FlowGraph::Node::Type::CONNECT)
+							break; // not an alias: def by cell
+						RTLIL::SigSpec rhs_sig = node->connect.second;
+						if (rhs_sig.is_wire()) {
+							RTLIL::Wire *rhs_wire = rhs_sig.as_wire();
+							if (unbuffered_wires[rhs_wire]) {
+								wire_it = rhs_wire; // maybe an alias
+							} else {
+								debug_alias_wires[wire] = rhs_wire; // is an alias
+								break;
+							}
+						} else if (rhs_sig.is_fully_const()) {
+							debug_const_wires[wire] = rhs_sig.as_const(); // is a const
+							break;
+						} else {
+							break; // not an alias: complex rhs
+						}
+					}
+				}
+			}
 		}
-		if (has_feedback_arcs || has_buffered_wires) {
+		if (has_feedback_arcs || has_buffered_comb_wires) {
 			// Although both non-feedback buffered combinatorial wires and apparent feedback wires may be eliminated
-			// by optimizing the design, if after `opt_clean -purge` there are any feedback wires remaining, it is very
+			// by optimizing the design, if after `proc; flatten` there are any feedback wires remaining, it is very
 			// likely that these feedback wires are indicative of a true logic loop, so they get emphasized in the message.
 			const char *why_pessimistic = nullptr;
 			if (has_feedback_arcs)
 				why_pessimistic = "feedback wires";
-			else if (has_buffered_wires)
+			else if (has_buffered_comb_wires)
 				why_pessimistic = "buffered combinatorial wires";
-			log("\n");
 			log_warning("Design contains %s, which require delta cycles during evaluation.\n", why_pessimistic);
-			if (!max_opt_level)
-				log("Increasing the optimization level may eliminate %s from the design.\n", why_pessimistic);
+			if (!run_flatten)
+				log("Flattening may eliminate %s from the design.\n", why_pessimistic);
+			if (!run_proc)
+				log("Converting processes to netlists may eliminate %s from the design.\n", why_pessimistic);
 		}
 	}
 
@@ -2087,37 +2290,47 @@ struct CxxrtlWorker {
 
 	void prepare_design(RTLIL::Design *design)
 	{
+		bool did_anything = false; // becomes true once any pass has been run on the design
 		bool has_sync_init, has_packed_mem;
 		log_push();
 		check_design(design, has_sync_init, has_packed_mem);
-		if (run_proc_flatten) {
-			Pass::call(design, "proc");
+		if (run_flatten) { // flattening and process conversion are now controlled independently
 			Pass::call(design, "flatten");
+			did_anything = true;
+		}
+		if (run_proc) { // when `proc` is not run, sync init (if any) is still lowered by the branch below
+			Pass::call(design, "proc");
+			did_anything = true;
 		} else if (has_sync_init) {
 			// We're only interested in proc_init, but it depends on proc_prune and proc_clean, so call those
 			// in case they weren't already. (This allows `yosys foo.v -o foo.cc` to work.)
 			Pass::call(design, "proc_prune");
 			Pass::call(design, "proc_clean");
 			Pass::call(design, "proc_init");
+			did_anything = true;
 		}
-		if (has_packed_mem)
+		if (has_packed_mem) {
 			Pass::call(design, "memory_unpack");
+			did_anything = true;
+		}
 		// Recheck the design if it was modified.
 		if (has_sync_init || has_packed_mem)
 			check_design(design, has_sync_init, has_packed_mem);
 		log_assert(!(has_sync_init || has_packed_mem));
-		if (run_opt_clean_purge)
-			Pass::call(design, "opt_clean -purge");
 		log_pop();
+		if (did_anything) // only emit a spacer when one of the passes above actually ran
+			log_spacer();
 		analyze_design(design);
 	}
 };
 
 struct CxxrtlBackend : public Backend {
 	static const int DEFAULT_OPT_LEVEL = 6;
+	static const int OPT_LEVEL_DEBUG = 4;
+	static const int DEFAULT_DEBUG_LEVEL = 1;
 
 	CxxrtlBackend() : Backend("cxxrtl", "convert design to C++ RTL simulation") { }
-	void help() YS_OVERRIDE
+	void help() override
 	{
 		//   |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|
 		log("\n");
@@ -2136,9 +2349,9 @@ struct CxxrtlBackend : public Backend {
 		log("      top.step();\n");
 		log("      while (1) {\n");
 		log("        /* user logic */\n");
-		log("        top.p_clk = value<1> {0u};\n");
+		log("        top.p_clk.set(false);\n");
 		log("        top.step();\n");
-		log("        top.p_clk = value<1> {1u};\n");
+		log("        top.p_clk.set(true);\n");
 		log("        top.step();\n");
 		log("      }\n");
 		log("    }\n");
@@ -2285,6 +2498,17 @@ struct CxxrtlBackend : public Backend {
 		log("        place the generated code into namespace <ns-name>. if not specified,\n");
 		log("        \"cxxrtl_design\" is used.\n");
 		log("\n");
+		log("    -noflatten\n");
+		log("        don't flatten the design. fully flattened designs can evaluate within\n");
+		log("        one delta cycle if they have no combinatorial feedback.\n");
+		log("        note that the debug interface and waveform dumps use full hierarchical\n");
+		log("        names for all wires even in flattened designs.\n");
+		log("\n");
+		log("    -noproc\n");
+		log("        don't convert processes to netlists. in most designs, converting\n");
+		log("        processes significantly improves evaluation performance at the cost of\n");
+		log("        slight increase in compilation time.\n");
+		log("\n");
 		log("    -O <level>\n");
 		log("        set the optimization level. the default is -O%d. higher optimization\n", DEFAULT_OPT_LEVEL);
 		log("        levels dramatically decrease compile and run time, and highest level\n");
@@ -2294,27 +2518,46 @@ struct CxxrtlBackend : public Backend {
 		log("        no optimization.\n");
 		log("\n");
 		log("    -O1\n");
-		log("        elide internal wires if possible.\n");
+		log("        localize internal wires if possible.\n");
 		log("\n");
 		log("    -O2\n");
-		log("        like -O1, and localize internal wires if possible.\n");
+		log("        like -O1, and unbuffer internal wires if possible.\n");
 		log("\n");
 		log("    -O3\n");
-		log("        like -O2, and elide public wires not marked (*keep*) if possible.\n");
+		log("        like -O2, and elide internal wires if possible.\n");
 		log("\n");
 		log("    -O4\n");
-		log("        like -O3, and localize public wires not marked (*keep*) if possible.\n");
+		log("        like -O3, and unbuffer public wires not marked (*keep*) if possible.\n");
 		log("\n");
 		log("    -O5\n");
-		log("        like -O4, and run `opt_clean -purge` first.\n");
+		log("        like -O4, and localize public wires not marked (*keep*) if possible.\n");
 		log("\n");
 		log("    -O6\n");
-		log("        like -O5, and run `proc; flatten` first.\n");
+		log("        like -O5, and elide public wires not marked (*keep*) if possible.\n");
+		log("\n");
+		log("    -Og\n");
+		log("        highest optimization level that provides debug information for all\n");
+		log("        public wires. currently, alias for -O%d.\n", OPT_LEVEL_DEBUG);
+		log("\n");
+		log("    -g <level>\n");
+		log("        set the debug level. the default is -g%d. higher debug levels provide\n", DEFAULT_DEBUG_LEVEL);
+		log("        more visibility and generate more code, but do not pessimize evaluation.\n");
+		log("\n");
+		log("    -g0\n");
+		log("        no debug information.\n");
+		log("\n");
+		log("    -g1\n");
+		log("        debug information for non-optimized public wires. this also makes it\n");
+		log("        possible to use the C API.\n");
 		log("\n");
 	}
-	void execute(std::ostream *&f, std::string filename, std::vector<std::string> args, RTLIL::Design *design) YS_OVERRIDE
+
+	void execute(std::ostream *&f, std::string filename, std::vector<std::string> args, RTLIL::Design *design) override
 	{
+		bool noflatten = false;
+		bool noproc = false;
 		int opt_level = DEFAULT_OPT_LEVEL;
+		int debug_level = DEFAULT_DEBUG_LEVEL;
 		CxxrtlWorker worker;
 
 		log_header(design, "Executing CXXRTL backend.\n");
@@ -2322,6 +2565,23 @@ struct CxxrtlBackend : public Backend {
 		size_t argidx;
 		for (argidx = 1; argidx < args.size(); argidx++)
 		{
+			if (args[argidx] == "-noflatten") {
+				noflatten = true;
+				continue;
+			}
+			if (args[argidx] == "-noproc") {
+				noproc = true;
+				continue;
+			}
+			if (args[argidx] == "-Og") {
+				opt_level = OPT_LEVEL_DEBUG;
+				continue;
+			}
+			if (args[argidx] == "-O" && argidx+1 < args.size() && args[argidx+1] == "g") {
+				argidx++;
+				opt_level = OPT_LEVEL_DEBUG;
+				continue;
+			}
 			if (args[argidx] == "-O" && argidx+1 < args.size()) {
 				opt_level = std::stoi(args[++argidx]);
 				continue;
@@ -2330,6 +2590,14 @@ struct CxxrtlBackend : public Backend {
 				opt_level = std::stoi(args[argidx].substr(2));
 				continue;
 			}
+			if (args[argidx] == "-g" && argidx+1 < args.size()) {
+				debug_level = std::stoi(args[++argidx]);
+				continue;
+			}
+			if (args[argidx].substr(0, 2) == "-g" && args[argidx].size() == 3 && isdigit(args[argidx][2])) {
+				debug_level = std::stoi(args[argidx].substr(2));
+				continue;
+			}
 			if (args[argidx] == "-header") {
 				worker.split_intf = true;
 				continue;
@@ -2342,31 +2610,43 @@ struct CxxrtlBackend : public Backend {
 		}
 		extra_args(f, filename, args, argidx);
 
+		worker.run_flatten = !noflatten;
+		worker.run_proc = !noproc;
 		switch (opt_level) {
+			// the highest level here must match DEFAULT_OPT_LEVEL
 			case 6:
-				worker.max_opt_level = true;
-				worker.run_proc_flatten = true;
+				worker.elide_public = true;
 				YS_FALLTHROUGH
 			case 5:
-				worker.run_opt_clean_purge = true;
+				worker.localize_public = true;
 				YS_FALLTHROUGH
 			case 4:
-				worker.localize_public = true;
+				worker.unbuffer_public = true;
 				YS_FALLTHROUGH
 			case 3:
-				worker.elide_public = true;
+				worker.elide_internal = true;
 				YS_FALLTHROUGH
 			case 2:
 				worker.localize_internal = true;
 				YS_FALLTHROUGH
 			case 1:
-				worker.elide_internal = true;
+				worker.unbuffer_internal = true;
 				YS_FALLTHROUGH
 			case 0:
 				break;
 			default:
 				log_cmd_error("Invalid optimization level %d.\n", opt_level);
 		}
+		switch (debug_level) {
+			// the highest level here must match DEFAULT_DEBUG_LEVEL
+			case 1:
+				worker.debug_info = true;
+				YS_FALLTHROUGH
+			case 0:
+				break;
+			default:
+				log_cmd_error("Invalid debug information level %d.\n", debug_level);
+		}
 
 		std::ofstream intf_f;
 		if (worker.split_intf) {
diff --git a/backends/cxxrtl/cxxrtl_capi.cc b/backends/cxxrtl/cxxrtl_capi.cc
new file mode 100644
index 000000000..e0566e152
--- /dev/null
+++ b/backends/cxxrtl/cxxrtl_capi.cc
@@ -0,0 +1,63 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2020  whitequark <whitequark@whitequark.org>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+// This file is a part of the CXXRTL C API. It should be used together with `cxxrtl_capi.h`.
+
+#include <backends/cxxrtl/cxxrtl.h>
+#include <backends/cxxrtl/cxxrtl_capi.h>
+
+struct _cxxrtl_handle { // definition behind the opaque `cxxrtl_handle` pointer type
+	std::unique_ptr<cxxrtl::module> module; // the design; moved out of the toplevel by `cxxrtl_create`
+	cxxrtl::debug_items objects; // filled in by `module->debug_info()` in `cxxrtl_create`
+};
+
+// Private function for use by other units of the C API; it is not declared in `cxxrtl_capi.h`.
+const cxxrtl::debug_items &cxxrtl_debug_items_from_handle(cxxrtl_handle handle) {
+	return handle->objects; // a reference into the handle, not a copy
+}
+
+cxxrtl_handle cxxrtl_create(cxxrtl_toplevel design) {
+	cxxrtl_handle handle = new _cxxrtl_handle; // released by `cxxrtl_destroy`
+	handle->module = std::move(design->module); // take ownership of the design away from the toplevel
+	handle->module->debug_info(handle->objects); // populate the table read by `cxxrtl_get_parts` and `cxxrtl_enum`
+	delete design; // the toplevel is consumed here and must not be used by the caller afterwards
+	return handle;
+}
+
+void cxxrtl_destroy(cxxrtl_handle handle) {
+	delete handle; // also destroys the owned module and the debug item table held by the handle
+}
+
+size_t cxxrtl_step(cxxrtl_handle handle) {
+	return handle->module->step(); // forwarded to the design; its result is returned unchanged
+}
+
+struct cxxrtl_object *cxxrtl_get_parts(cxxrtl_handle handle, const char *name, size_t *parts) {
+	auto it = handle->objects.table.find(name); // lookup by full hierarchical name
+	if (it == handle->objects.table.end())
+		return nullptr; // not found; `*parts` is left untouched in this case
+	*parts = it->second.size();
+	return static_cast<cxxrtl_object*>(&it->second[0]); // address of the first part registered under this name
+}
+
+void cxxrtl_enum(cxxrtl_handle handle, void *data,
+                 void (*callback)(void *data, const char *name,
+                                  cxxrtl_object *object, size_t parts)) {
+	for (auto &it : handle->objects.table) // one callback invocation per name, covering all of its parts
+		callback(data, it.first.c_str(), static_cast<cxxrtl_object*>(&it.second[0]), it.second.size());
+}
diff --git a/backends/cxxrtl/cxxrtl_capi.h b/backends/cxxrtl/cxxrtl_capi.h
new file mode 100644
index 000000000..599284898
--- /dev/null
+++ b/backends/cxxrtl/cxxrtl_capi.h
@@ -0,0 +1,185 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2020  whitequark <whitequark@whitequark.org>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef CXXRTL_CAPI_H
+#define CXXRTL_CAPI_H
+
+// This file is a part of the CXXRTL C API. It should be used together with `cxxrtl_capi.cc`.
+//
+// The CXXRTL C API makes it possible to drive CXXRTL designs using C or any other language that
+// supports the C ABI, for example, Python. It does not provide a way to implement black boxes.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque reference to a design toplevel.
+//
+// A design toplevel can only be used to create a design handle.
+typedef struct _cxxrtl_toplevel *cxxrtl_toplevel;
+
+// The constructor for a design toplevel is provided as a part of generated code for that design.
+// Its prototype matches:
+//
+// cxxrtl_toplevel <design-name>_create();
+
+// Opaque reference to a design handle.
+//
+// A design handle is required by all operations in the C API.
+typedef struct _cxxrtl_handle *cxxrtl_handle;
+
+// Create a design handle from a design toplevel.
+//
+// The `design` is consumed by this operation and cannot be used afterwards.
+cxxrtl_handle cxxrtl_create(cxxrtl_toplevel design);
+
+// Release all resources used by a design and its handle.
+void cxxrtl_destroy(cxxrtl_handle handle);
+
+// Simulate the design to a fixed point.
+//
+// Returns the number of delta cycles.
+size_t cxxrtl_step(cxxrtl_handle handle);
+
+// Type of a simulated object; stored in the `type` field of `struct cxxrtl_object`.
+enum cxxrtl_type {
+	// Values correspond to singly buffered netlist nodes, i.e. nodes driven exclusively by
+	// combinatorial cells, or toplevel input nodes.
+	//
+	// Values can be inspected via the `curr` pointer. If the `next` pointer is NULL, the value is
+	// driven by a constant and can never be modified. Otherwise, the value can be modified through
+	// the `next` pointer (which is equal to `curr` if not NULL). Note that changes to the bits
+	// driven by combinatorial cells will be ignored.
+	//
+	// Values always have depth 1.
+	CXXRTL_VALUE = 0,
+
+	// Wires correspond to doubly buffered netlist nodes, i.e. nodes driven, at least in part, by
+	// storage cells, or by combinatorial cells that are a part of a feedback path.
+	//
+	// Wires can be inspected via the `curr` pointer and modified via the `next` pointer (which are
+	// distinct for wires). Note that changes to the bits driven by combinatorial cells will be
+	// ignored.
+	//
+	// Wires always have depth 1.
+	CXXRTL_WIRE = 1,
+
+	// Memories correspond to memory cells.
+	//
+	// Memories can be inspected and modified via the `curr` pointer. Due to a limitation of this
+	// API, memories cannot yet be modified in a guaranteed race-free way, and the `next` pointer is
+	// always NULL.
+	CXXRTL_MEMORY = 2,
+
+	// Aliases correspond to netlist nodes driven by another node such that their value is always
+	// exactly equal, or driven by a constant value.
+	//
+	// Aliases can be inspected via the `curr` pointer. They cannot be modified, and the `next`
+	// pointer is always NULL.
+	CXXRTL_ALIAS = 3,
+
+	// More object types may be added in the future, but the existing ones will never change.
+};
+
+// Description of a simulated object.
+//
+// The `curr` and `next` arrays can be accessed directly to inspect and, if applicable, modify
+// the bits stored in the object.
+struct cxxrtl_object {
+	// Type of the object.
+	//
+	// All objects have the same memory layout determined by `width` and `depth`, but the type
+	// determines all other properties of the object.
+	uint32_t type; // actually `enum cxxrtl_type`
+
+	// Width of the object in bits.
+	size_t width;
+
+	// Index of the least significant bit.
+	size_t lsb_at;
+
+	// Depth of the object. Only meaningful for memories; for other objects, always 1.
+	size_t depth;
+
+	// Index of the first word. Only meaningful for memories; for other objects, always 0.
+	size_t zero_at;
+
+	// Bits stored in the object, as 32-bit chunks, least significant bits first.
+	//
+	// The width is rounded up to a multiple of 32; the padding bits are always set to 0 by
+	// the simulation code, and must be always written as 0 when modified by user code.
+	// In memories, every element is stored contiguously. Therefore, the total number of chunks
+	// in any object is `((width + 31) / 32) * depth`.
+	//
+	// To allow the simulation to be partitioned into multiple independent units communicating
+	// through wires, the bits are double buffered. To avoid race conditions, user code should
+	// always read from `curr` and write to `next`. The `curr` pointer is always valid; for objects
+	// that cannot be modified, or cannot be modified in a race-free way, `next` is NULL.
+	uint32_t *curr;
+	uint32_t *next;
+
+	// More description fields may be added in the future, but the existing ones will never change.
+};
+
+// Retrieve description of a simulated object.
+//
+// The `name` is the full hierarchical name of the object in the Yosys notation, where public names
+// have a `\` prefix and hierarchy levels are separated by single spaces. For example, if
+// the top-level module instantiates a module `foo`, which in turn contains a wire `bar`, the full
+// hierarchical name is `\foo \bar`.
+//
+// The storage of a single abstract object may be split (usually with the `splitnets` pass) into
+// many physical parts, all of which correspond to the same hierarchical name. To handle such cases,
+// this function returns an array and writes its length to `parts`. The array is sorted by `lsb_at`.
+//
+// Returns the object parts if it was found, NULL otherwise. The returned parts are valid until
+// the design is destroyed.
+struct cxxrtl_object *cxxrtl_get_parts(cxxrtl_handle handle, const char *name, size_t *parts);
+
+// Retrieve description of a single part simulated object.
+//
+// This function is a shortcut for the most common use of `cxxrtl_get_parts`. It asserts that,
+// if the object exists, it consists of a single part. If assertions are disabled, it returns NULL
+// for multi-part objects. It is `static inline` so that C callers need no out-of-line definition.
+static inline struct cxxrtl_object *cxxrtl_get(cxxrtl_handle handle, const char *name) {
+	size_t parts = 0;
+	struct cxxrtl_object *object = cxxrtl_get_parts(handle, name, &parts);
+	assert(object == NULL || parts == 1);
+	if (object == NULL || parts == 1)
+		return object;
+	return NULL;
+}
+
+// Enumerate simulated objects.
+//
+// For every object in the simulation, `callback` is called with the provided `data`, the full
+// hierarchical name of the object (see `cxxrtl_get` for details), and the object parts.
+// The provided `name` and `object` values are valid until the design is destroyed.
+void cxxrtl_enum(cxxrtl_handle handle, void *data,
+                 void (*callback)(void *data, const char *name,
+                                  struct cxxrtl_object *object, size_t parts));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/backends/cxxrtl/cxxrtl_vcd.h b/backends/cxxrtl/cxxrtl_vcd.h
new file mode 100644
index 000000000..dbeabbaf2
--- /dev/null
+++ b/backends/cxxrtl/cxxrtl_vcd.h
@@ -0,0 +1,244 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2020  whitequark <whitequark@whitequark.org>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef CXXRTL_VCD_H
+#define CXXRTL_VCD_H
+
+#include <backends/cxxrtl/cxxrtl.h>
+
+namespace cxxrtl {
+
+class vcd_writer {
+	struct variable {
+		size_t ident; // index of this entry in `variables`; encoded by `emit_ident` as the VCD identifier
+		size_t width; // width in bits
+		chunk_t *curr; // storage that is read when the variable is sampled
+		size_t prev_off; // offset in `cache` of the last seen value, or (size_t)-1 for constants
+	};
+
+	std::vector<std::string> current_scope;
+	std::vector<variable> variables;
+	std::vector<chunk_t> cache;
+	std::map<chunk_t*, size_t> aliases;
+	bool streaming = false;
+
+	void emit_timescale(unsigned number, const std::string &unit) {
+		assert(!streaming); // header sections may only be written before `emit_enddefinitions`
+		assert(number == 1 || number == 10 || number == 100);
+		assert(unit == "s" || unit == "ms" || unit == "us" ||
+		       unit == "ns" || unit == "ps" || unit == "fs");
+		buffer += "$timescale " + std::to_string(number) + " " + unit + " $end\n";
+	}
+
+	void emit_scope(const std::vector<std::string> &scope) {
+		assert(!streaming);
+		// Compare whole prefixes: checking only the last component would keep a stale parent open (`a b` vs `c b`).
+		size_t common = 0; // number of leading components shared by `current_scope` and `scope`
+		while (common < current_scope.size() && common < scope.size() && current_scope[common] == scope[common])
+			common++;
+		for (; current_scope.size() > common; current_scope.pop_back())
+			buffer += "$upscope $end\n"; // close every open scope that is not part of the shared prefix
+		while (current_scope.size() < scope.size()) { // then open the remaining components of `scope`, outermost first
+			buffer += "$scope module " + scope[current_scope.size()] + " $end\n";
+			current_scope.push_back(scope[current_scope.size()]);
+		}
+	}
+
+	void emit_ident(size_t ident) {
+		do {
+			buffer += '!' + ident % 94; // "base94": one character in '!'..'~' per digit, least significant digit first
+			ident /= 94;
+		} while (ident != 0); // do/while so that identifier 0 still produces one character
+	}
+
+	void emit_var(const variable &var, const std::string &type, const std::string &name,
+	              size_t lsb_at, bool multipart) {
+		assert(!streaming); // variable definitions belong to the header section
+		buffer += "$var " + type + " " + std::to_string(var.width) + " ";
+		emit_ident(var.ident);
+		buffer += " " + name;
+		if (multipart || name.back() == ']' || lsb_at != 0) { // a bit range suffix is appended in these three cases
+			if (var.width == 1)
+				buffer += " [" + std::to_string(lsb_at) + "]"; // single bit: index only
+			else
+				buffer += " [" + std::to_string(lsb_at + var.width - 1) + ":" + std::to_string(lsb_at) + "]";
+		}
+		buffer += " $end\n";
+	}
+
+	void emit_enddefinitions() {
+		assert(!streaming); // may only be emitted once
+		buffer += "$enddefinitions $end\n";
+		streaming = true; // from here on only `emit_time`, `emit_scalar` and `emit_vector` pass their asserts
+	}
+
+	void emit_time(uint64_t timestamp) {
+		assert(streaming); // only valid once the definitions section has been closed
+		buffer += "#" + std::to_string(timestamp) + "\n";
+	}
+
+	void emit_scalar(const variable &var) {
+		assert(streaming);
+		assert(var.width == 1); // wider variables go through `emit_vector`
+		buffer += (*var.curr ? '1' : '0'); // any nonzero first chunk is written as '1'
+		emit_ident(var.ident);
+		buffer += '\n';
+	}
+
+	void emit_vector(const variable &var) {
+		assert(streaming);
+		buffer += 'b';
+		for (size_t bit = var.width - 1; bit != (size_t)-1; bit--) { // most significant bit first
+			bool bit_curr = var.curr[bit / (8 * sizeof(chunk_t))] & (chunk_t(1) << (bit % (8 * sizeof(chunk_t))));
+			buffer += (bit_curr ? '1' : '0'); // the mask above is a `chunk_t`, so the top bit of a chunk never shifts into the sign bit of `int`
+		}
+		buffer += ' ';
+		emit_ident(var.ident);
+		buffer += '\n';
+	}
+
+	const variable &register_variable(size_t width, chunk_t *curr, bool constant = false) {
+		if (aliases.count(curr)) { // storage already registered: reuse its variable and thus its identifier
+			return variables[aliases[curr]];
+		} else {
+			const size_t chunks = (width + (sizeof(chunk_t) * 8 - 1)) / (sizeof(chunk_t) * 8); // width rounded up to whole chunks
+			aliases[curr] = variables.size();
+			if (constant) {
+				variables.emplace_back(variable { variables.size(), width, curr, (size_t)-1 }); // no cache slot for constants
+			} else {
+				variables.emplace_back(variable { variables.size(), width, curr, cache.size() });
+				cache.insert(cache.end(), &curr[0], &curr[chunks]); // snapshot the current value as the "previous" one
+			}
+			return variables.back(); // NOTE: the reference is into `variables`, which may reallocate on the next registration
+		}
+	}
+
+	bool test_variable(const variable &var) { // returns whether the value changed; refreshes `cache` when it did
+		if (var.prev_off == (size_t)-1)
+			return false; // constant
+		const size_t chunks = (var.width + (sizeof(chunk_t) * 8 - 1)) / (sizeof(chunk_t) * 8);
+		if (std::equal(&var.curr[0], &var.curr[chunks], &cache[var.prev_off])) {
+			return false; // unchanged since the cached copy was taken
+		} else {
+			std::copy(&var.curr[0], &var.curr[chunks], &cache[var.prev_off]); // remember the new value for the next comparison
+			return true;
+		}
+	}
+
+	static std::vector<std::string> split_hierarchy(const std::string &hier_name) { // splits on single spaces
+		std::vector<std::string> hierarchy;
+		size_t prev = 0; // start of the component currently being scanned
+		while (true) {
+			size_t curr = hier_name.find_first_of(' ', prev);
+			if (curr == std::string::npos) {
+				hierarchy.push_back(hier_name.substr(prev)); // last component; the result is therefore never empty
+				break;
+			} else {
+				hierarchy.push_back(hier_name.substr(prev, curr - prev));
+				prev = curr + 1; // skip the separator
+			}
+		}
+		return hierarchy;
+	}
+
+public:
+	std::string buffer;
+
+	void timescale(unsigned number, const std::string &unit) {
+		emit_timescale(number, unit); // asserts that sampling has not started and that both arguments are valid
+	}
+
+	void add(const std::string &hier_name, const debug_item &item, bool multipart = false) {
+		std::vector<std::string> scope = split_hierarchy(hier_name);
+		std::string name = scope.back(); // the last component is the variable name; the rest is its scope
+		scope.pop_back();
+
+		emit_scope(scope);
+		switch (item.type) {
+			// Not the best naming but oh well...
+			case debug_item::VALUE:
+				emit_var(register_variable(item.width, item.curr, /*constant=*/item.next == nullptr),
+				         "wire", name, item.lsb_at, multipart);
+				break;
+			case debug_item::WIRE:
+				emit_var(register_variable(item.width, item.curr),
+				         "reg", name, item.lsb_at, multipart);
+				break;
+			case debug_item::MEMORY: {
+				const size_t stride = (item.width + (sizeof(chunk_t) * 8 - 1)) / (sizeof(chunk_t) * 8); // chunks per element
+				for (size_t index = 0; index < item.depth; index++) { // every element becomes its own VCD variable
+					chunk_t *nth_curr = &item.curr[stride * index];
+					std::string nth_name = name + '[' + std::to_string(index) + ']';
+					emit_var(register_variable(item.width, nth_curr),
+					         "reg", nth_name, item.lsb_at, multipart);
+				}
+				break;
+			}
+			case debug_item::ALIAS:
+				// Like VALUE, but, even though `item.next == nullptr` always holds, the underlying value
+				// can actually change, and must be tracked. In most cases the VCD identifier will be
+				// unified with the aliased reg, but we should handle the case where only the alias is
+				// added to the VCD writer, too.
+				emit_var(register_variable(item.width, item.curr),
+				         "wire", name, item.lsb_at, multipart);
+				break;
+		}
+	}
+
+	template<class Filter>
+	void add(const debug_items &items, const Filter &filter) {
+		// `debug_items` is a map, so the items are already sorted in an order optimal for emitting
+		// VCD scope sections.
+		for (auto &it : items.table)
+			for (auto &part : it.second)
+				if (filter(it.first, part)) // the filter is consulted once per part, not once per name
+					add(it.first, part, it.second.size() > 1); // multipart when the name maps to several parts
+	}
+
+	void add(const debug_items &items) { // adds every item; forwards to the filtering overload
+		this->add(items, [](const std::string &, const debug_item &) { // no `template` keyword: no argument list follows
+			return true;
+		});
+	}
+
+	void add_without_memories(const debug_items &items) { // adds every item whose type is not MEMORY
+		this->add(items, [](const std::string &, const debug_item &item) { // no `template` keyword: no argument list follows
+			return item.type != debug_item::MEMORY;
+		});
+	}
+
+	void sample(uint64_t timestamp) {
+		bool first_sample = !streaming;
+		if (first_sample) {
+			emit_scope({}); // close every scope that is still open
+			emit_enddefinitions();
+		}
+		emit_time(timestamp);
+		for (const auto &var : variables) // by reference: `test_variable` only updates `cache`, so no copy is needed
+			if (test_variable(var) || first_sample) { // the first sample dumps every variable, changed or not
+				if (var.width == 1)
+					emit_scalar(var);
+				else
+					emit_vector(var);
+			}
+	}
+};
+
+}
+
+#endif
diff --git a/backends/cxxrtl/cxxrtl_vcd_capi.cc b/backends/cxxrtl/cxxrtl_vcd_capi.cc
new file mode 100644
index 000000000..52a9198b8
--- /dev/null
+++ b/backends/cxxrtl/cxxrtl_vcd_capi.cc
@@ -0,0 +1,83 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2020  whitequark <whitequark@whitequark.org>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+// This file is a part of the CXXRTL C API. It should be used together with `cxxrtl_vcd_capi.h`.
+
+#include <backends/cxxrtl/cxxrtl_vcd.h>
+#include <backends/cxxrtl/cxxrtl_vcd_capi.h>
+
+extern const cxxrtl::debug_items &cxxrtl_debug_items_from_handle(cxxrtl_handle handle);
+
+struct _cxxrtl_vcd { // definition behind the opaque `cxxrtl_vcd` pointer type
+	cxxrtl::vcd_writer writer;
+	bool flush = false; // set by `cxxrtl_vcd_read`; makes the next sample or read clear `writer.buffer` first
+};
+
+cxxrtl_vcd cxxrtl_vcd_create() {
+	return new _cxxrtl_vcd; // released by `cxxrtl_vcd_destroy`
+}
+
+void cxxrtl_vcd_destroy(cxxrtl_vcd vcd) {
+	delete vcd; // also frees the writer and its buffer, so pointers handed out by `cxxrtl_vcd_read` become invalid
+}
+
+void cxxrtl_vcd_timescale(cxxrtl_vcd vcd, int number, const char *unit) {
+	vcd->writer.timescale(number, unit); // `number` converts to `unsigned`; the writer asserts on invalid values
+}
+
+void cxxrtl_vcd_add(cxxrtl_vcd vcd, const char *name, cxxrtl_object *object) {
+	// Note the copy. We don't know whether `object` came from a design (in which case it is
+	// an instance of `debug_item`), or from user code (in which case it is an instance of
+	// `cxxrtl_object`), so casting the pointer wouldn't be safe.
+	vcd->writer.add(name, cxxrtl::debug_item(*object)); // `multipart` is left at its default (false)
+}
+
+void cxxrtl_vcd_add_from(cxxrtl_vcd vcd, cxxrtl_handle handle) {
+	vcd->writer.add(cxxrtl_debug_items_from_handle(handle)); // adds every debug item of the design, unfiltered
+}
+
+void cxxrtl_vcd_add_from_if(cxxrtl_vcd vcd, cxxrtl_handle handle, void *data,
+														int (*filter)(void *data, const char *name,
+														              const cxxrtl_object *object)) {
+	vcd->writer.add(cxxrtl_debug_items_from_handle(handle),
+		[=](const std::string &name, const cxxrtl::debug_item &item) { // captures `filter` and `data` by value
+			return filter(data, name.c_str(), static_cast<const cxxrtl_object*>(&item)); // nonzero keeps the item
+		});
+}
+
+void cxxrtl_vcd_add_from_without_memories(cxxrtl_vcd vcd, cxxrtl_handle handle) {
+	vcd->writer.add_without_memories(cxxrtl_debug_items_from_handle(handle)); // skips items of type MEMORY
+}
+
+void cxxrtl_vcd_sample(cxxrtl_vcd vcd, uint64_t time) {
+	if (vcd->flush) { // the buffer contents were already handed out by `cxxrtl_vcd_read`; start afresh
+		vcd->writer.buffer.clear();
+		vcd->flush = false;
+	}
+	vcd->writer.sample(time); // appends to the buffer; several samples accumulate until the next read
+}
+
+void cxxrtl_vcd_read(cxxrtl_vcd vcd, const char **data, size_t *size) {
+	if (vcd->flush) { // a second read without a sample in between yields an empty buffer
+		vcd->writer.buffer.clear();
+		vcd->flush = false;
+	}
+	*data = vcd->writer.buffer.c_str(); // points into the writer's buffer; no copy is made
+	*size = vcd->writer.buffer.size();
+	vcd->flush = true; // the data is discarded lazily, on the next sample or read
+}
diff --git a/backends/cxxrtl/cxxrtl_vcd_capi.h b/backends/cxxrtl/cxxrtl_vcd_capi.h
new file mode 100644
index 000000000..d55afe223
--- /dev/null
+++ b/backends/cxxrtl/cxxrtl_vcd_capi.h
@@ -0,0 +1,107 @@
+/*
+ *  yosys -- Yosys Open SYnthesis Suite
+ *
+ *  Copyright (C) 2020  whitequark <whitequark@whitequark.org>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef CXXRTL_VCD_CAPI_H
+#define CXXRTL_VCD_CAPI_H
+
+// This file is a part of the CXXRTL C API. It should be used together with `cxxrtl_vcd_capi.cc`.
+//
+// The CXXRTL C API for VCD writing makes it possible to insert virtual probes into designs and
+// dump waveforms to Value Change Dump files.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <backends/cxxrtl/cxxrtl_capi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque reference to a VCD writer.
+typedef struct _cxxrtl_vcd *cxxrtl_vcd;
+
+// Create a VCD writer; release it with `cxxrtl_vcd_destroy`. (`(void)` makes this a C prototype.)
+cxxrtl_vcd cxxrtl_vcd_create(void);
+
+// Release all resources used by a VCD writer.
+void cxxrtl_vcd_destroy(cxxrtl_vcd vcd);
+
+// Set VCD timescale.
+//
+// The `number` must be 1, 10, or 100, and the `unit` must be one of `"s"`, `"ms"`, `"us"`, `"ns"`,
+// `"ps"`, or `"fs"`.
+//
+// Timescale can only be set before the first call to `cxxrtl_vcd_sample`.
+void cxxrtl_vcd_timescale(cxxrtl_vcd vcd, int number, const char *unit);
+
+// Schedule a specific CXXRTL object to be sampled.
+//
+// The `name` is a full hierarchical name as described for `cxxrtl_get`; it does not need to match
+// the original name of `object`, if any. The `object` must outlive the VCD writer, but there are
+// no other requirements; if desired, it can be provided by user code, rather than come from
+// a design.
+//
+// Objects can only be scheduled before the first call to `cxxrtl_vcd_sample`.
+void cxxrtl_vcd_add(cxxrtl_vcd vcd, const char *name, struct cxxrtl_object *object);
+
+// Schedule all CXXRTL objects in a simulation.
+//
+// The design `handle` must outlive the VCD writer.
+//
+// Objects can only be scheduled before the first call to `cxxrtl_vcd_sample`.
+void cxxrtl_vcd_add_from(cxxrtl_vcd vcd, cxxrtl_handle handle);
+
+// Schedule CXXRTL objects in a simulation that match a given predicate.
+//
+// `filter` is called for every object in the simulation with the provided `data`, the object's
+// full hierarchical name (see `cxxrtl_get` for details), and the object description; the object
+// is sampled if `filter` returns a non-zero value. The design `handle` must outlive the VCD writer.
+//
+// Objects can only be scheduled before the first call to `cxxrtl_vcd_sample`.
+void cxxrtl_vcd_add_from_if(cxxrtl_vcd vcd, cxxrtl_handle handle, void *data,
+                            int (*filter)(void *data, const char *name,
+                                          const struct cxxrtl_object *object));
+
+// Schedule all CXXRTL objects in a simulation except for memories.
+//
+// The design `handle` must outlive the VCD writer.
+//
+// Objects can only be scheduled before the first call to `cxxrtl_vcd_sample`.
+void cxxrtl_vcd_add_from_without_memories(cxxrtl_vcd vcd, cxxrtl_handle handle);
+
+// Sample all scheduled objects.
+//
+// First, `time` is written to the internal buffer. Second, the values of every signal changed since
+// the previous call to `cxxrtl_vcd_sample` (all values if this is the first call) are written to
+// the internal buffer. The contents of the buffer can be retrieved with `cxxrtl_vcd_read`.
+void cxxrtl_vcd_sample(cxxrtl_vcd vcd, uint64_t time);
+
+// Retrieve buffered VCD data.
+//
+// The pointer to the start of the next chunk of VCD data is assigned to `*data`, and the length
+// of that chunk is assigned to `*size`. The pointer to the data is valid until the next call to
+// `cxxrtl_vcd_sample` or `cxxrtl_vcd_read`. Once all of the buffered data has been retrieved,
+// this function will return zero-sized chunks until `cxxrtl_vcd_sample` buffers new data.
+void cxxrtl_vcd_read(cxxrtl_vcd vcd, const char **data, size_t *size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif