From be4c7cfb3419364a615d1c83c87ffab380bde752 Mon Sep 17 00:00:00 2001
From: Dan Callaghan <dcallagh@google.com>
Date: Fri, 11 Jun 2021 16:24:29 +1000
Subject: [PATCH] soc/cores/ram: allow populating initial values in Nexus LRAM

On designs which use Nexus parts without any external memory, it can be
difficult to fit an embedded ROM program larger than a few KiB. Radiant
cannot infer LRAM, and refuses to infer EBRAM under many circumstances
too, so large memories tend to just consume a huge number of LUTs.

This patch makes it possible to explicitly wire up an LRAM as a ROM,
populate its initial values with a program, and execute directly from
it. That lets us embed programs up to 64KiB.
---
 litex/soc/cores/ram/lattice_nx.py | 58 ++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/litex/soc/cores/ram/lattice_nx.py b/litex/soc/cores/ram/lattice_nx.py
index 52f77a12c..8a66bc86d 100644
--- a/litex/soc/cores/ram/lattice_nx.py
+++ b/litex/soc/cores/ram/lattice_nx.py
@@ -21,37 +21,68 @@ Note that this memory is dual port, but we only use a single port in this
 instantiation.
 """
 
+
+def initval_parameters(contents, width):
+    """
+    In Radiant, initial values for LRAM are passed as a sequence of parameters
+    named INITVAL_00 ... INITVAL_7F. Each parameter value contains 4096 bits
+    of data, encoded as a 1280-digit hexadecimal number: 8 bits of zero padding
+    before every 32 bits of real data, with the highest-indexed word first.
+
+    contents: exactly 524288 // width unsigned integers, one per memory word.
+    width:    width of each word of `contents` in bits, either 32 or 64.
+    Returns the list of 128 Instance.Parameter objects, in INITVAL_xx order.
+    """
+    assert width in [32, 64]
+    # Each LRAM is 64KiB == 524288 bits
+    assert len(contents) == 524288 // width
+    chunk_size = 4096 // width
+    # Bit offsets of the 32-bit pieces of one word, most significant first.
+    shifts = range(width - 32, -1, -32)
+    parameters = []
+    for i in range(0x80):
+        # The leading hex digits hold the last word, so walk the chunk backwards.
+        words = reversed(contents[chunk_size * i:chunk_size * (i + 1)])
+        value = '0x' + ''.join('00{:08X}'.format((word >> s) & 0xFFFFFFFF) for word in words for s in shifts)
+        parameters.append(Instance.Parameter('INITVAL_{:02X}'.format(i), value))
+    return parameters
+
+
 class NXLRAM(Module):
     def __init__(self, width=32, size=128*kB):
         self.bus = wishbone.Interface(width)
         assert width in [32, 64]
+        self.width = width
+        self.size = size
 
         if width == 32:
             assert size in [64*kB, 128*kB, 192*kB, 256*kB, 320*kB]
-            depth_cascading = size//(64*kB)
-            width_cascading = 1
+            self.depth_cascading = size//(64*kB)
+            self.width_cascading = 1
         if width == 64:
             assert size in [128*kB, 256*kB]
-            depth_cascading = size//(128*kB)
-            width_cascading = 2
+            self.depth_cascading = size//(128*kB)
+            self.width_cascading = 2
 
+        self.lram_blocks = []
         # Combine RAMs to increase Depth.
-        for d in range(depth_cascading):
+        for d in range(self.depth_cascading):
+            self.lram_blocks.append([])
             # Combine RAMs to increase Width.
-            for w in range(width_cascading):
+            for w in range(self.width_cascading):
                 datain  = Signal(32)
                 dataout = Signal(32)
                 cs      = Signal()
                 wren    = Signal()
                 self.comb += [
                     datain.eq(self.bus.dat_w[32*w:32*(w+1)]),
-                    If(self.bus.adr[14:14+depth_cascading.bit_length()] == d,
+                    If(self.bus.adr[14:14+self.depth_cascading.bit_length()] == d,
                         cs.eq(1),
                         wren.eq(self.bus.we & self.bus.stb & self.bus.cyc),
                         self.bus.dat_r[32*w:32*(w+1)].eq(dataout)
                     ),
                 ]
-                self.specials += Instance("SP512K",
+                lram_block = Instance("SP512K",
                     p_ECC_BYTE_SEL = "BYTE_EN",
                     i_DI       = datain,
                     i_AD       = self.bus.adr[:14],
@@ -64,5 +95,16 @@ class NXLRAM(Module):
                     i_BYTEEN_N = ~self.bus.sel[4*w:4*(w+1)],
                     o_DO       = dataout
                 )
+                self.lram_blocks[d].append(lram_block)
+                self.specials += lram_block
 
         self.sync += self.bus.ack.eq(self.bus.stb & self.bus.cyc & ~self.bus.ack)
+
+    def add_initial_value(self, data):
+        """Preload the LRAMs from `data`, a sequence of bus-width words; shorter data is zero-padded."""
+        depth = 64*kB*8 // 32  # Every SP512K block holds 16384 words of 32 bits.
+        assert len(data) <= self.size*8 // self.width, "initial data does not fit in the LRAM"
+        data = list(data) + [0] * (self.size*8 // self.width - len(data))  # Copy; don't mutate the caller's list.
+        for d in range(self.depth_cascading):
+            for w in range(self.width_cascading):  # Block (d, w) holds bits [32*w, 32*w+32) of words [d*depth, (d+1)*depth).
+                self.lram_blocks[d][w].items += initval_parameters([(x >> 32*w) & 0xFFFFFFFF for x in data[d*depth:(d+1)*depth]], 32)