2 files changed, 29 insertions, 24 deletions
diff --git a/backends/cxxrtl/cxxrtl.h b/backends/cxxrtl/cxxrtl.h
index 166b4896f..85f45ac7f 100644
--- a/backends/cxxrtl/cxxrtl.h
+++ b/backends/cxxrtl/cxxrtl.h
@@ -66,6 +66,7 @@ namespace cxxrtl {
 // Therefore, using relatively wide chunks and clearing the high bits explicitly and only when we know they may be
 // clobbered results in simpler generated code.
 typedef uint32_t chunk_t;
+typedef uint64_t wide_chunk_t; // twice as wide as chunk_t (uint32_t), so the product of two chunks fits without overflow
 
 template<typename T>
 struct chunk_traits {
@@ -454,6 +455,24 @@ struct value : public expr_base<value<Bits>> {
 		bool overflow = (is_neg() == !other.is_neg()) && (is_neg() != result.is_neg());
 		return result.is_neg() ^ overflow; // a.scmp(b) ≡ a s< b
 	}
+
+	template<size_t ResultBits> // ResultBits: bit width of the returned product
+	value<ResultBits> mul(const value<Bits> &other) const { // schoolbook multiply of *this by other, one chunk pair at a time
+		value<ResultBits> result;
+		wide_chunk_t wide_result[result.chunks + 1] = {}; // zeroed accumulators: one per result chunk, plus one to absorb the last carry
+		for (size_t n = 0; n < chunks; n++) { // n walks the chunks of *this
+			for (size_t m = 0; m < chunks && n + m < result.chunks; m++) { // m walks other's chunks; partial products past the result are skipped
+				wide_result[n + m] += wide_chunk_t(data[n]) * wide_chunk_t(other.data[m]); // widen first so the product is computed in wide_chunk_t
+				wide_result[n + m + 1] += wide_result[n + m] >> chunk::bits; // carry everything above chunk::bits into the next slot
+				wide_result[n + m] &= chunk::mask; // leave only the bits selected by chunk::mask in this slot
+			}
+		}
+		for (size_t n = 0; n < result.chunks; n++) {
+			result.data[n] = wide_result[n]; // copy each accumulator into the result's chunk storage
+		}
+		result.data[result.chunks - 1] &= result.msb_mask; // apply msb_mask to the top chunk (presumably clears bits beyond ResultBits; msb_mask is defined outside this hunk)
+		return result;
+	}
 };
 
 // Expression template for a slice, usable as lvalue or rvalue, and composable with other expression templates here.
@@ -1306,28 +1325,14 @@ value<BitsY> sub_ss(const value<BitsA> &a, const value<BitsB> &b) {
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> mul_uu(const value<BitsA> &a, const value<BitsB> &b) {
-	value<BitsY> product;
-	value<BitsY> multiplicand = a.template zcast<BitsY>();
-	const value<BitsB> &multiplier = b;
-	uint32_t multiplicand_shift = 0;
-	for (size_t step = 0; step < BitsB; step++) {
-		if (multiplier.bit(step)) {
-			multiplicand = multiplicand.shl(value<32> { multiplicand_shift });
-			product = product.add(multiplicand);
-			multiplicand_shift = 0;
-		}
-		multiplicand_shift++;
-	}
-	return product;
+	constexpr size_t BitsM = BitsA >= BitsB ? BitsA : BitsB; // BitsM = max(BitsA, BitsB): common operand width
+	return a.template zcast<BitsM>().template mul<BitsY>(b.template zcast<BitsM>()); // mul() takes an operand of the same width as *this, so both are zcast to BitsM (presumably zero-extension; zcast is defined outside this hunk)
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
 CXXRTL_ALWAYS_INLINE
 value<BitsY> mul_ss(const value<BitsA> &a, const value<BitsB> &b) {
-	value<BitsB + 1> ub = b.template sext<BitsB + 1>();
-	if (ub.is_neg()) ub = ub.neg();
-	value<BitsY> y = mul_uu<BitsY>(a.template scast<BitsY>(), ub);
-	return b.is_neg() ? y.neg() : y;
+	return a.template scast<BitsY>().template mul<BitsY>(b.template scast<BitsY>()); // both operands are scast to the result width BitsY (presumably a signed resize; scast is defined outside this hunk), then multiplied at that width
 }
 
 template<size_t BitsY, size_t BitsA, size_t BitsB>
diff --git a/techlibs/ice40/cells_sim.v b/techlibs/ice40/cells_sim.v
index ad572c877..7ee809262 100644
--- a/techlibs/ice40/cells_sim.v
+++ b/techlibs/ice40/cells_sim.v
@@ -2508,7 +2508,7 @@ module SB_SPRAM256KA (
 
 	always @(negedge POWEROFF) begin
 		for (i = 0; i <= 16383; i = i+1)
-			mem[i] = 'bx;
+			mem[i] = 16'bx; // explicitly sized 16-bit unknown value written to every word on the falling edge of POWEROFF
 	end
 
 	always @(posedge CLOCK, posedge off) begin
@@ -2516,17 +2516,17 @@ module SB_SPRAM256KA (
 			DATAOUT <= 0;
 		end else
 		if (STANDBY) begin
-			DATAOUT <= 'bx;
+			DATAOUT <= 16'bx; // explicitly sized 16-bit unknown value while STANDBY is asserted
 		end else
 		if (CHIPSELECT) begin
 			if (!WREN) begin
 				DATAOUT <= mem[ADDRESS];
 			end else begin
-				if (MASKWREN[0]) mem[ADDRESS][ 3: 0] = DATAIN[ 3: 0];
-				if (MASKWREN[1]) mem[ADDRESS][ 7: 4] = DATAIN[ 7: 4];
-				if (MASKWREN[2]) mem[ADDRESS][11: 8] = DATAIN[11: 8];
-				if (MASKWREN[3]) mem[ADDRESS][15:12] = DATAIN[15:12];
-				DATAOUT <= 'bx;
+				if (MASKWREN[0]) mem[ADDRESS][ 3: 0] <= DATAIN[ 3: 0]; // non-blocking write of bits 3:0, enabled by MASKWREN[0]
+				if (MASKWREN[1]) mem[ADDRESS][ 7: 4] <= DATAIN[ 7: 4]; // non-blocking write of bits 7:4, enabled by MASKWREN[1]
+				if (MASKWREN[2]) mem[ADDRESS][11: 8] <= DATAIN[11: 8]; // non-blocking write of bits 11:8, enabled by MASKWREN[2]
+				if (MASKWREN[3]) mem[ADDRESS][15:12] <= DATAIN[15:12]; // non-blocking write of bits 15:12, enabled by MASKWREN[3]
+				DATAOUT <= 16'bx; // explicitly sized 16-bit unknown value on DATAOUT during a write cycle
 			end
 		end
 	end