From 05ed238829db75206da0c2afbd4ee5d29b903483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C4=99drzej=20Boczar?= Date: Tue, 19 Jan 2021 14:48:17 +0100 Subject: [PATCH] lpddr4: split implementation into multiple files in separate directory --- litedram/init.py | 6 +- litedram/phy/lpddr4/__init__.py | 2 + litedram/phy/lpddr4/basephy.py | 312 ++++++++ litedram/phy/lpddr4/commands.py | 151 ++++ .../phy/{s7lpddr4phy.py => lpddr4/s7phy.py} | 6 +- litedram/phy/lpddr4/simphy.py | 192 +++++ litedram/phy/lpddr4/utils.py | 71 ++ litedram/phy/lpddr4phy.py | 695 ------------------ test/test_lpddr4.py | 30 +- 9 files changed, 751 insertions(+), 714 deletions(-) create mode 100644 litedram/phy/lpddr4/__init__.py create mode 100644 litedram/phy/lpddr4/basephy.py create mode 100644 litedram/phy/lpddr4/commands.py rename litedram/phy/{s7lpddr4phy.py => lpddr4/s7phy.py} (98%) create mode 100644 litedram/phy/lpddr4/simphy.py create mode 100644 litedram/phy/lpddr4/utils.py delete mode 100644 litedram/phy/lpddr4phy.py diff --git a/litedram/init.py b/litedram/init.py index 15be146..e84010c 100644 --- a/litedram/init.py +++ b/litedram/init.py @@ -554,9 +554,9 @@ def get_lpddr4_phy_init_sequence(phy_settings, timing_settings): ba = 0 return ("Load More Register {}".format(ma), a, ba, cmds["MODE_REGISTER"], 200) - from litedram.phy.lpddr4phy import DFIPhaseAdapter - zqc_start = DFIPhaseAdapter.MPC["ZQC-START"] - zqc_latch = DFIPhaseAdapter.MPC["ZQC-LATCH"] + from litedram.phy.lpddr4.commands import MPC + zqc_start = MPC["ZQC-START"] + zqc_latch = MPC["ZQC-LATCH"] init_sequence = [ ("Release reset", 0x0000, 0, cmds["UNRESET"], 50000), diff --git a/litedram/phy/lpddr4/__init__.py b/litedram/phy/lpddr4/__init__.py new file mode 100644 index 0000000..5ce2577 --- /dev/null +++ b/litedram/phy/lpddr4/__init__.py @@ -0,0 +1,2 @@ +from litedram.phy.lpddr4.s7phy import S7LPDDR4PHY +from litedram.phy.lpddr4.simphy import LPDDR4SimPHY diff --git a/litedram/phy/lpddr4/basephy.py 
class LPDDR4PHY(Module, AutoCSR):
    """Base LPDDR4 PHY operating on per-pad bit vectors that still need serialization.

    The class exposes an 8-phase DFI interface (``self.dfi``) and translates it into
    ``self.ck_*`` signals. Each ``ck_*`` signal holds the sequence of values that a pad
    should output (or has received) during one sys clock cycle: ``nphases`` bits for SDR
    signals and ``2*nphases`` bits for DDR signals. Concrete PHYs are expected to connect
    these signals to the actual pads through serializers/deserializers.

    Parameters
    ----------
    pads : object
        Pads of the memory. ``len(pads.dq)`` defines the data width and an optional
        ``pads.cs_n`` defines the number of ranks (1 when absent).
    sys_clk_freq : float
        Frequency of the sys clock; the DRAM clock period is ``1 / (8 * sys_clk_freq)``.
    write_ser_latency : int
        Latency (in sys clock cycles) of the output serializers, added to read latency.
    read_des_latency : int
        Latency (in sys clock cycles) of the input deserializers, added to read latency.
    phytype : str
        Stored in ``PhySettings.phytype``.
    cmd_delay : optional
        Stored in ``PhySettings.cmd_delay``.
    """

    def __init__(self, pads, *,
                 sys_clk_freq, write_ser_latency, read_des_latency, phytype, cmd_delay=None):
        self.pads        = pads
        self.memtype     = memtype     = "LPDDR4"
        self.nranks      = nranks      = 1 if not hasattr(pads, "cs_n") else len(pads.cs_n)
        self.databits    = databits    = len(pads.dq)
        self.addressbits = addressbits = 17  # for activate row address
        self.bankbits    = bankbits    = 3
        self.nphases     = nphases     = 8
        self.tck         = tck         = 1 / (nphases*sys_clk_freq)
        assert databits % 8 == 0

        # Parameters -------------------------------------------------------------------------------
        def get_cl_cw(memtype, tck):
            # MT53E256M16D1, No DBI, Set A
            f_to_cl_cwl = OrderedDict()
            f_to_cl_cwl[ 532e6] = ( 6,  4)  # FIXME: with that low cwl, wrtap is 0
            f_to_cl_cwl[1066e6] = (10,  6)
            f_to_cl_cwl[1600e6] = (14,  8)
            f_to_cl_cwl[2132e6] = (20, 10)
            f_to_cl_cwl[2666e6] = (24, 12)
            f_to_cl_cwl[3200e6] = (28, 14)
            f_to_cl_cwl[3732e6] = (32, 16)
            f_to_cl_cwl[4266e6] = (36, 18)
            # Pick the first (slowest) entry whose period does not exceed tck.
            for f, (cl, cwl) in f_to_cl_cwl.items():
                if tck >= 2/f:
                    return cl, cwl
            raise ValueError("No CL/CWL setting available for tck = {}".format(tck))

        # Bitslip introduces latency from `cycles` up to `cycles + 1`
        bitslip_cycles = 1
        # Commands are sent over 4 cycles of DRAM clock (sys8x)
        cmd_latency = 4
        # Commands read from adapters are delayed on ConstBitSlips
        ca_latency = 1

        cl, cwl         = get_cl_cw(memtype, tck)
        cl_sys_latency  = get_sys_latency(nphases, cl)
        cwl_sys_latency = get_sys_latency(nphases, cwl)
        rdphase         = get_sys_phase(nphases, cl_sys_latency, cl + cmd_latency)
        wrphase         = get_sys_phase(nphases, cwl_sys_latency, cwl + cmd_latency)

        # When the calculated phase is negative, it means that we need to increase sys latency
        def updated_latency(phase):
            delay_update = 0
            while phase < 0:
                phase += nphases
                delay_update += 1
            return phase, delay_update

        wrphase, cwl_sys_delay = updated_latency(wrphase)
        rdphase, cl_sys_delay  = updated_latency(rdphase)
        cwl_sys_latency += cwl_sys_delay
        cl_sys_latency  += cl_sys_delay

        # Read latency
        read_data_delay = ca_latency + write_ser_latency + cl_sys_latency  # DFI cmd -> read data on DQ
        read_des_delay  = read_des_latency + bitslip_cycles  # data on DQ -> data on DFI rddata
        read_latency    = read_data_delay + read_des_delay

        # Write latency
        write_latency = cwl_sys_latency

        # FIXME: remove
        import os
        if os.environ.get("DEBUG") == '1':
            from pprint import pprint
            debug_values = [
                ('cl',              cl),
                ('cwl',             cwl),
                ('cl_sys_latency',  cl_sys_latency),
                ('cwl_sys_latency', cwl_sys_latency),
                ('rdphase',         rdphase),
                ('wrphase',         wrphase),
                ('read_data_delay', read_data_delay),
                ('read_des_delay',  read_des_delay),
                ('read_latency',    read_latency),
                ('write_latency',   write_latency),
            ]
            for label, value in debug_values:
                print(label, end=' = ')
                pprint(value)

        # Registers --------------------------------------------------------------------------------
        self._rst = CSRStorage()

        # One select bit per data byte; gates the bitslip strobes below.
        self._dly_sel = CSRStorage(databits//8)

        self._wlevel_en     = CSRStorage()
        self._wlevel_strobe = CSR()

        self._rdly_dq_bitslip_rst = CSR()
        self._rdly_dq_bitslip     = CSR()

        self._wdly_dq_bitslip_rst = CSR()
        self._wdly_dq_bitslip     = CSR()

        self._rdphase = CSRStorage(log2_int(nphases), reset=rdphase)
        self._wrphase = CSRStorage(log2_int(nphases), reset=wrphase)

        # PHY settings -----------------------------------------------------------------------------
        self.settings = PhySettings(
            phytype       = phytype,
            memtype       = memtype,
            databits      = databits,
            dfi_databits  = 2*databits,
            nranks        = nranks,
            nphases       = nphases,
            rdphase       = self._rdphase.storage,
            wrphase       = self._wrphase.storage,
            cl            = cl,
            cwl           = cwl,
            read_latency  = read_latency,
            write_latency = write_latency,
            cmd_latency   = cmd_latency,
            cmd_delay     = cmd_delay,
        )

        # DFI Interface ----------------------------------------------------------------------------
        # Due to the fact that LPDDR4 has 16n prefetch we use 8 phases to be able to read/write a
        # whole burst during a single controller clock cycle. PHY should use sys8x clock.
        self.dfi = dfi = Interface(addressbits, bankbits, nranks, 2*databits, nphases=nphases)

        # # #

        adapters = [DFIPhaseAdapter(phase) for phase in self.dfi.phases]
        self.submodules += adapters

        # Now prepare the data by converting the sequences on adapters into sequences on the pads.
        # We have to ignore overlapping commands, and module timings have to ensure that there are
        # no overlapping commands anyway.
        # Pads: reset_n, CS, CKE, CK, CA[5:0], DMI[1:0], DQ[15:0], DQS[1:0], ODT_CA
        self.ck_clk     = Signal(2*nphases)
        self.ck_cke     = Signal(nphases)
        self.ck_odt     = Signal(nphases)
        self.ck_reset_n = Signal(nphases)
        self.ck_cs      = Signal(nphases)
        self.ck_ca      = [Signal(nphases)   for _ in range(6)]
        self.ck_dmi_o   = [Signal(2*nphases) for _ in range(2)]
        self.ck_dmi_i   = [Signal(2*nphases) for _ in range(2)]
        self.dmi_oe     = Signal()
        self.ck_dq_o    = [Signal(2*nphases) for _ in range(databits)]
        self.ck_dq_i    = [Signal(2*nphases) for _ in range(databits)]
        self.dq_oe      = Signal()
        self.ck_dqs_o   = [Signal(2*nphases) for _ in range(2)]
        self.ck_dqs_i   = [Signal(2*nphases) for _ in range(2)]
        self.dqs_oe     = Signal()

        # Clocks -----------------------------------------------------------------------------------
        self.comb += self.ck_clk.eq(bitpattern("-_-_-_-_" * 2))

        # Simple commands --------------------------------------------------------------------------
        self.comb += [
            self.ck_cke.eq(Cat(delayed(self, phase.cke) for phase in self.dfi.phases)),
            self.ck_odt.eq(Cat(delayed(self, phase.odt) for phase in self.dfi.phases)),
            self.ck_reset_n.eq(Cat(delayed(self, phase.reset_n) for phase in self.dfi.phases)),
        ]

        # LPDDR4 Commands --------------------------------------------------------------------------
        # Each command can span several phases (up to 4), so we must ignore overlapping commands,
        # but in general, module timings should be set in a way that overlapping will never happen.

        # Create a history of valid adapters used for masking overlapping ones.
        # TODO: make optional, as it takes up resources and the controller should ensure no overlaps
        valids = ConstBitSlip(dw=nphases, cycles=1, slp=0)
        self.submodules += valids
        self.comb += valids.i.eq(Cat(a.valid for a in adapters))
        valids_hist = Signal.like(valids.r)
        # TODO: especially make this part optional
        # A command stays valid in the history only if none of the 3 previous slots holds an
        # (already filtered) valid command.
        for i in range(len(valids_hist)):
            was_valid_before = reduce(or_, valids_hist[max(0, i-3):i], 0)
            self.comb += valids_hist[i].eq(valids.r[i] & ~was_valid_before)

        cs_per_adapter = []
        ca_per_adapter = defaultdict(list)
        for phase, adapter in enumerate(adapters):
            # The signals from an adapter can be used if there were no commands on 3 previous cycles
            allowed = ~reduce(or_, valids_hist[nphases+phase - 3:nphases+phase])

            # Use CS and CA of given adapter slipped by `phase` bits
            cs_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
            self.submodules += cs_bs
            self.comb += cs_bs.i.eq(Cat(adapter.cs))
            cs_mask = Replicate(allowed, len(cs_bs.o))
            cs = cs_bs.o & cs_mask
            cs_per_adapter.append(cs)

            # For CA we need to do the same for each bit
            for bit in range(6):
                ca_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
                self.submodules += ca_bs
                ca_bit_hist = [adapter.ca[i][bit] for i in range(4)]
                self.comb += ca_bs.i.eq(Cat(*ca_bit_hist))
                ca_mask = Replicate(allowed, len(ca_bs.o))
                ca = ca_bs.o & ca_mask
                ca_per_adapter[bit].append(ca)

        # OR all the masked signals
        self.comb += self.ck_cs.eq(reduce(or_, cs_per_adapter))
        for bit in range(6):
            self.comb += self.ck_ca[bit].eq(reduce(or_, ca_per_adapter[bit]))

        # DQ ---------------------------------------------------------------------------------------
        dq_oe = Signal()
        self.comb += self.dq_oe.eq(delayed(self, dq_oe, cycles=1))

        for bit in range(self.databits):
            # output
            self.submodules += BitSlip(
                dw     = 2*nphases,
                cycles = bitslip_cycles,
                rst    = (self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage,
                slp    = self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip.re,
                i      = Cat(*[self.dfi.phases[i//2].wrdata[i%2 * self.databits + bit] for i in range(2*nphases)]),
                o      = self.ck_dq_o[bit],
            )

            # input
            dq_i_bs = Signal(2*nphases)
            self.submodules += BitSlip(
                dw     = 2*nphases,
                cycles = bitslip_cycles,
                rst    = (self._dly_sel.storage[bit//8] & self._rdly_dq_bitslip_rst.re) | self._rst.storage,
                slp    = self._dly_sel.storage[bit//8] & self._rdly_dq_bitslip.re,
                i      = self.ck_dq_i[bit],
                o      = dq_i_bs,
            )
            for i in range(2*nphases):
                self.comb += self.dfi.phases[i//2].rddata[i%2 * self.databits + bit].eq(dq_i_bs[i])

        # DQS --------------------------------------------------------------------------------------
        dqs_oe        = Signal()
        dqs_preamble  = Signal()
        dqs_postamble = Signal()
        dqs_pattern   = DQSPattern(
            preamble      = dqs_preamble,  # FIXME: are defined the opposite way (common.py) ???
            postamble     = dqs_postamble,
            wlevel_en     = self._wlevel_en.storage,
            wlevel_strobe = self._wlevel_strobe.re)
        self.submodules += dqs_pattern
        self.comb += [
            self.dqs_oe.eq(delayed(self, dqs_oe, cycles=1)),
        ]

        for byte in range(self.databits//8):
            # output
            # The loop variable already is a byte index, so it selects the `_dly_sel` bit
            # directly; this keeps each DQS slipping together with the DQ bits of its byte.
            self.submodules += BitSlip(
                dw     = 2*nphases,
                cycles = bitslip_cycles,
                rst    = (self._dly_sel.storage[byte] & self._wdly_dq_bitslip_rst.re) | self._rst.storage,
                slp    = self._dly_sel.storage[byte] & self._wdly_dq_bitslip.re,
                i      = dqs_pattern.o,
                o      = self.ck_dqs_o[byte],
            )

        # DMI --------------------------------------------------------------------------------------
        # DMI signal is used for Data Mask or Data Bus Inversion depending on Mode Registers values.
        # With DM and DBI disabled, this signal is a Don't Care.
        # With DM enabled, masking is performed only when the command used is WRITE-MASKED.
        # TODO: use WRITE-MASKED for all write commands, and configure Mode Registers for that
        # during DRAM initialization (we don't want to support DBI).
        for byte in range(self.databits//8):
            self.comb += self.ck_dmi_o[byte].eq(0)

        # Read Control Path ------------------------------------------------------------------------
        # Creates a delay line of read commands coming from the DFI interface. The output is used to
        # signal a valid read data to the DFI interface.
        #
        # The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI
        # interface, the latency is the sum of the OSERDESE2, CAS, ISERDESE2 and Bitslip latencies.
        rddata_en = TappedDelayLine(
            signal = reduce(or_, [dfi.phases[i].rddata_en for i in range(nphases)]),
            ntaps  = self.settings.read_latency
        )
        self.submodules += rddata_en

        self.comb += [phase.rddata_valid.eq(rddata_en.output | self._wlevel_en.storage) for phase in dfi.phases]

        # Write Control Path -----------------------------------------------------------------------
        wrtap = cwl_sys_latency - 1
        assert wrtap >= 1

        # Create a delay line of write commands coming from the DFI interface. These taps are used
        # to control DQ/DQS tristates.
        wrdata_en = TappedDelayLine(
            signal = reduce(or_, [dfi.phases[i].wrdata_en for i in range(nphases)]),
            ntaps  = wrtap + 2
        )
        self.submodules += wrdata_en

        self.comb += dq_oe.eq(wrdata_en.taps[wrtap])
        self.comb += If(self._wlevel_en.storage, dqs_oe.eq(1)).Else(dqs_oe.eq(dqs_preamble | dq_oe | dqs_postamble))

        # Write DQS Postamble/Preamble Control Path ------------------------------------------------
        # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
        # write. During writes, DQS tristate is configured as output for at least 3 sys_clk cycles:
        # 1 for Preamble, 1 for the Write and 1 for the Postamble.
        self.comb += dqs_preamble.eq( wrdata_en.taps[wrtap - 1] & ~wrdata_en.taps[wrtap + 0])
        self.comb += dqs_postamble.eq(wrdata_en.taps[wrtap + 1] & ~wrdata_en.taps[wrtap + 0])
# MPC (multipurpose command) can be used to perform different actions
# We use ZQC with BA=0 to issue MPC, where OP[6:0] = A[6:0]
MPC = {
    "NOP":           0b0000000,  # only OP[6] must be 0
    "READ-FIFO":     0b1000001,
    "READ-DQ-CAL":   0b1000011,
    # RFU:           0b1000101
    "WRITE-FIFO":    0b1000111,
    # RFU:           0b1001001
    "START-DQS-OSC": 0b1001011,
    "STOP-DQS-OSC":  0b1001101,
    "ZQC-START":     0b1001111,
    "ZQC-LATCH":     0b1010001,
}


class DFIPhaseAdapter(Module):
    """Translate the command of one DFI phase into an LPDDR4 command (2 or 4 cycles).

    A "full" LPDDR4 command is built from 1 or 2 "small commands", each of which takes
    2 DRAM clock cycles (SDR). This module converts the DFI command found on a single DFI
    phase into a 4-cycle sequence on the CS/CA[5:0] buses. DFI commands that need only one
    "small command" place it in the 2nd slot (3rd and 4th cycle) and fill the 1st slot
    with DESELECT, so that all timings can be counted from CS low on the 4th cycle.

    Attributes
    ----------
    cs : Signal(4)
        CS values for the 4 SDR cycles.
    ca : Array of 4 x Signal(6)
        CA[5:0] values for the 4 SDR cycles.
    valid : Signal()
        Set when the DFI phase carries one of the decoded commands.
    """

    def __init__(self, dfi_phase):
        # CS/CA values for 4 SDR cycles
        self.cs    = Signal(4)
        self.ca    = Array([Signal(6) for _ in range(4)])
        self.valid = Signal()

        # # #

        # Two "small command" encoders: slot 1 drives cycles 0-1, slot 2 drives cycles 2-3.
        self.submodules.cmd1 = Command(dfi_phase)
        self.submodules.cmd2 = Command(dfi_phase)
        self.comb += [
            self.cs[:2].eq(self.cmd1.cs),
            self.cs[2:].eq(self.cmd2.cs),
            self.ca[0].eq(self.cmd1.ca[0]),
            self.ca[1].eq(self.cmd1.ca[1]),
            self.ca[2].eq(self.cmd2.ca[0]),
            self.ca[3].eq(self.cmd2.ca[1]),
        ]

        # DFI strobes are active low; pack them (inverted) as {cas, ras, we}.
        dfi_cmd = Signal(3)
        self.comb += dfi_cmd.eq(Cat(~dfi_phase.we_n, ~dfi_phase.ras_n, ~dfi_phase.cas_n))
        opcodes = {  # cas, ras, we
            "NOP": 0b000,
            "ACT": 0b010,
            "RD":  0b100,
            "WR":  0b101,
            "PRE": 0b011,
            "REF": 0b110,
            "ZQC": 0b001,
            "MRS": 0b111,
        }

        def emit(first, second, valid=1):
            # Statements selecting both small commands and flagging validity.
            statements = self.cmd1.set(first)
            statements = statements + self.cmd2.set(second)
            return statements + [self.valid.eq(valid)]

        # DFI command -> (small command in slot 1, small command in slot 2)
        translation = [
            ("ACT", "ACTIVATE-1", "ACTIVATE-2"),
            ("RD",  "READ-1",     "CAS-2"),
            ("WR",  "WRITE-1",    "CAS-2"),  # TODO: masked write
            ("PRE", "DESELECT",   "PRECHARGE"),
            ("REF", "DESELECT",   "REFRESH"),
            ("ZQC", "DESELECT",   "MPC"),
            ("MRS", "MRW-1",      "MRW-2"),
        ]
        cases = {opcodes[dfi_name]: emit(first, second) for dfi_name, first, second in translation}
        cases["default"] = emit("DESELECT", "DESELECT", valid=0)

        # Commands are decoded only when dfi.cs_n is asserted (low).
        self.comb += If(dfi_phase.cs_n == 0, Case(dfi_cmd, cases))
class Command(Module):
    """LPDDR4 command decoder

    Decodes a command from single DFI phase into LPDDR4 "small command"
    consisting of 2 CS values and 2 CA[5:0] values.

    LPDDR4 "small commands" are transmitted over 2 clock cycles. In first
    cycle CS is driven high and in the second cycle it stays low. In each
    of the cycles the bits on CA[5:0] are latched and interpreted differently.
    This module translates a DFI command into the values of CS/CA that shall
    be transmitted over 2 DRAM clock cycles.

    Attributes
    ----------
    cs : Signal(2)
        CS values for the two cycles.
    ca : Array of 2 x Signal(6)
        CA[5:0] values for the cycle with CS high and the cycle with CS low.
    dfi : DFI phase
        Source of the address/bank bits referenced by the truth table.
    """

    # String description of 1st and 2nd edge of each command, later parsed to
    # construct the value. CS is assumed to be H for 1st edge and L for 2nd edge.
    TRUTH_TABLE = {
        "MRW-1":        ["L H H L L OP7",       "MA0 MA1 MA2 MA3 MA4 MA5"],
        "MRW-2":        ["L H H L H OP6",       "OP0 OP1 OP2 OP3 OP4 OP5"],
        "MRR-1":        ["L H H H L V",         "MA0 MA1 MA2 MA3 MA4 MA5"],
        "REFRESH":      ["L L L H L AB",        "BA0 BA1 BA2 V V V"],
        "ACTIVATE-1":   ["H L R12 R13 R14 R15", "BA0 BA1 BA2 R16 R10 R11"],
        "ACTIVATE-2":   ["H H R6 R7 R8 R9",     "R0 R1 R2 R3 R4 R5"],
        "WRITE-1":      ["L L H L L BL",        "BA0 BA1 BA2 V C9 AP"],
        "MASK WRITE-1": ["L L H H L BL",        "BA0 BA1 BA2 V C9 AP"],
        "READ-1":       ["L H L L L BL",        "BA0 BA1 BA2 V C9 AP"],
        "CAS-2":        ["L H L L H C8",        "C2 C3 C4 C5 C6 C7"],
        "PRECHARGE":    ["L L L L H AB",        "BA0 BA1 BA2 V V V"],
        "MPC":          ["L L L L L OP6",       "OP0 OP1 OP2 OP3 OP4 OP5"],
        "DESELECT":     ["X X X X X X",         "X X X X X X"],
    }

    # Sanity check of the table: every edge must describe exactly CA[5:0].
    for cmd, (subcmd1, subcmd2) in TRUTH_TABLE.items():
        assert len(subcmd1.split()) == 6, (cmd, subcmd1)
        assert len(subcmd2.split()) == 6, (cmd, subcmd2)

    def __init__(self, dfi_phase):
        self.cs  = Signal(2)
        self.ca  = Array([Signal(6), Signal(6)])  # CS high, CS low
        self.dfi = dfi_phase

    def set(self, cmd):
        """Return the list of assignments that drive CS/CA for command `cmd`.

        `cmd` must be a key of `TRUTH_TABLE` (`KeyError` otherwise). For every command
        other than "DESELECT", CS is driven high in the first cycle.
        """
        ops = []
        for i, description in enumerate(self.TRUTH_TABLE[cmd]):
            for j, bit in enumerate(description.split()):
                ops.append(self.ca[i][j].eq(self.parse_bit(bit, is_mpc=cmd == "MPC")))
        if cmd != "DESELECT":
            ops.append(self.cs[0].eq(1))
        return ops

    def parse_bit(self, bit, is_mpc=False):
        """Translate one truth-table token into a constant or a DFI signal bit.

        `is_mpc` is accepted for interface compatibility and is currently unused.
        Raises `ValueError` when `bit` is not a known token.
        """
        # Raw strings keep `\d` a regex escape instead of an invalid string escape.
        rules = {
            r"H":       lambda: 1,                          # high
            r"L":       lambda: 0,                          # low
            r"V":       lambda: 0,                          # defined logic
            r"X":       lambda: 0,                          # don't care
            r"BL":      lambda: 0,                          # on-the-fly burst length, not using
            r"AP":      lambda: self.dfi.address[10],       # auto precharge
            r"AB":      lambda: self.dfi.address[10],       # all banks
            r"BA(\d+)": lambda i: self.dfi.bank[i],
            r"R(\d+)":  lambda i: self.dfi.address[i],      # row
            r"C(\d+)":  lambda i: self.dfi.address[i],      # column
            r"MA(\d+)": lambda i: self.dfi.address[8+i],    # mode register address
            r"OP(\d+)": lambda i: self.dfi.address[i],      # mode register value, or operand for MPC
        }
        for pattern, value in rules.items():
            # The whole token must match; a prefix match would accept malformed tokens.
            m = re.fullmatch(pattern, bit)
            if m:
                args = [int(g) for g in m.groups()]
                return value(*args)
        raise ValueError(bit)
class LPDDR4SimulationPads(Module):
    """Pads used in simulation, with the tristate lines split into separate signals.

    For each of the DQ/DQS/DMI groups there are `*_o`, `*_i` and `*_oe` signals. The
    combined line shows `*_o` while `*_oe` is asserted and `*_i` otherwise.
    """

    def __init__(self, databits=16):
        self.clk_p   = Signal()
        self.clk_n   = Signal()
        self.cke     = Signal()
        self.odt     = Signal()
        self.reset_n = Signal()
        self.cs      = Signal()
        self.ca      = Signal(6)
        # signals for checking actual tristate lines state (PHY reads these)
        self.dq      = Signal(databits)
        self.dqs     = Signal(databits//8)
        self.dmi     = Signal(databits//8)
        # internal tristates i/o that should be driven for simulation
        self.dq_o    = Signal(databits)  # PHY drives these
        self.dq_i    = Signal(databits)  # DRAM chip (simulator) drives these
        self.dq_oe   = Signal()          # PHY drives these
        self.dqs_o   = Signal(databits//8)
        self.dqs_i   = Signal(databits//8)
        self.dqs_oe  = Signal()
        self.dmi_o   = Signal(databits//8)
        self.dmi_i   = Signal(databits//8)
        self.dmi_oe  = Signal()

        # Model each tristate line as a mux between the PHY side and the DRAM side.
        tristates = [
            (self.dq,  self.dq_o,  self.dq_i,  self.dq_oe),
            (self.dqs, self.dqs_o, self.dqs_i, self.dqs_oe),
            (self.dmi, self.dmi_o, self.dmi_i, self.dmi_oe),
        ]
        for line, driven, received, enable in tristates:
            self.comb += If(enable, line.eq(driven)).Else(line.eq(received))
class LPDDR4SimPHY(LPDDR4PHY):
    """LPDDR4 PHY for simulation, built on `LPDDR4SimulationPads`.

    Every `ck_*` vector of the base PHY is connected to the corresponding pad through a
    `Serializer` (outputs) or a `Deserializer` (inputs).

    Parameters
    ----------
    sys_clk_freq : float
        Passed to the base PHY.
    aligned_reset_zero : bool
        When set, (de)serializers that use phase 0 get `reset_value=0`.
    """

    def __init__(self, sys_clk_freq=100e6, aligned_reset_zero=False):
        pads = LPDDR4SimulationPads()
        self.submodules += pads
        super().__init__(pads,
            sys_clk_freq      = sys_clk_freq,
            write_ser_latency = Serializer.LATENCY,
            read_des_latency  = Deserializer.LATENCY,
            phytype           = "LPDDR4SimPHY")

        # Clock domain used for each supported phase shift (in degrees).
        sdr_clkdiv = {0: "sys8x",     90: "sys8x_90"}
        # for simulation we require sys8x_ddr clock (=sys16x)
        ddr_clkdiv = {0: "sys8x_ddr", 90: "sys8x_90_ddr"}
        ser_clk    = {0: "sys",       90: "sys"}
        des_clk    = {0: "sys",       90: "sys_11_25"}

        def attach(cls, prefix, phase, clk_map, clkdiv_map, **params):
            # Create one (de)serializer, name it `<prefix><name>` and add it as a submodule.
            clkdiv = clkdiv_map[phase]
            clk    = clk_map[phase]
            if aligned_reset_zero and phase == 0:
                params["reset_value"] = 0
            label = prefix + params.pop('name', '')
            self.submodules += cls(clk=clk, clkdiv=clkdiv, name=label.strip('_'), **params)

        def ser_sdr(phase=0, **kwargs):
            attach(Serializer, 'ser_', phase, ser_clk, sdr_clkdiv, i_dw=8, o_dw=1, **kwargs)

        def ser_ddr(phase=0, **kwargs):
            attach(Serializer, 'ser_', phase, ser_clk, ddr_clkdiv, i_dw=16, o_dw=1, **kwargs)

        def des_ddr(phase=0, **kwargs):
            attach(Deserializer, 'des_', phase, des_clk, ddr_clkdiv, i_dw=1, o_dw=16, **kwargs)

        # Clock is shifted 180 degrees to get rising edge in the middle of SDR signals.
        # To achieve that we send negated clock on clk_p and non-negated on clk_n.
        ser_ddr(i=~self.ck_clk, o=self.pads.clk_p, name='clk_p')
        ser_ddr(i=self.ck_clk,  o=self.pads.clk_n, name='clk_n')

        # Single-bit SDR control signals, followed by command/address.
        for pad in ('cke', 'odt', 'reset_n', 'cs'):
            ser_sdr(i=getattr(self, 'ck_' + pad), o=getattr(self.pads, pad), name=pad)
        for i in range(6):
            ser_sdr(i=self.ck_ca[i], o=self.pads.ca[i], name=f'ca{i}')

        # Tristate I/O (separate for simulation)
        for i in range(self.databits//8):
            ser_ddr(i=self.ck_dmi_o[i], o=self.pads.dmi_o[i], name=f'dmi_o{i}')
            des_ddr(o=self.ck_dmi_i[i], i=self.pads.dmi[i],   name=f'dmi_i{i}')
            ser_ddr(i=self.ck_dqs_o[i], o=self.pads.dqs_o[i], name=f'dqs_o{i}', phase=90)
            des_ddr(o=self.ck_dqs_i[i], i=self.pads.dqs[i],   name=f'dqs_i{i}', phase=90)
        for i in range(self.databits):
            ser_ddr(i=self.ck_dq_o[i], o=self.pads.dq_o[i], name=f'dq_o{i}')
            des_ddr(o=self.ck_dq_i[i], i=self.pads.dq[i],   name=f'dq_i{i}')

        # Output enable signals, delayed by the serializer latency to stay aligned with data.
        for group in ('dmi', 'dqs', 'dq'):
            enable = getattr(self, group + '_oe')
            pad_oe = getattr(self.pads, group + '_oe')
            self.comb += pad_oe.eq(delayed(self, enable, cycles=Serializer.LATENCY))
class Serializer(Module):
    """Serialize given input signal

    It latches the input data on the rising edge of `clk`. Output data counter `cnt` is incremented
    on rising edges of `clkdiv` and it determines current slice of `i` that is presented on `o`.
    `LATENCY` is specified in `clk` cycles.

    NOTE: both `clk` and `clkdiv` should be phase aligned.
    NOTE: `reset_value` is set to `ratio - 1` so that on the first clock edge after reset it is 0

    Parameters
    ----------
    clk, clkdiv : str
        Names of the clock domains: `clk` latches the input, `clkdiv` advances the counter.
    i_dw, o_dw : int
        Input and output widths; `i_dw` must be a larger multiple of `o_dw`.
    i, o, reset : optional
        Signals to use; fresh ones are created when omitted.
    reset_value : int
        Reset value of the counter; negative values count back from `ratio`.
    name : str, optional
        When given, the counter signal is named `<name>_cnt`.
    """
    LATENCY = 1

    def __init__(self, clk, clkdiv, i_dw, o_dw, i=None, o=None, reset=None, reset_value=-1, name=None):
        assert i_dw > o_dw
        assert i_dw % o_dw == 0
        ratio = i_dw // o_dw

        sd_clk    = getattr(self.sync, clk)
        sd_clkdiv = getattr(self.sync, clkdiv)

        if i is None: i = Signal(i_dw)
        if o is None: o = Signal(o_dw)
        if reset is None: reset = Signal()

        self.i     = i
        self.o     = o
        self.reset = reset

        if reset_value < 0:
            reset_value = ratio + reset_value

        cnt = Signal(max=ratio, reset=reset_value, name='{}_cnt'.format(name) if name is not None else None)
        # Parentheses are required: `|` binds tighter than `==`, so without them the
        # condition would be `(reset | cnt) == ratio - 1` and `reset` would not clear `cnt`.
        sd_clkdiv += If(reset | (cnt == ratio - 1), cnt.eq(0)).Else(cnt.eq(cnt + 1))

        i_d = Signal.like(self.i)
        sd_clk += i_d.eq(self.i)
        # Slice the latched input into `ratio` chunks and present the one selected by `cnt`.
        i_array = Array([i_d[n*o_dw:(n+1)*o_dw] for n in range(ratio)])
        self.comb += self.o.eq(i_array[cnt])
class Deserializer(Module):
    """Deserialize given input signal

    Latches the input data on the rising edges of `clkdiv` and stores them in the `o_pre` buffer.
    Additional latency cycle is used to ensure that the last input bit is deserialized correctly.

    NOTE: both `clk` and `clkdiv` should be phase aligned.
    NOTE: `reset_value` is set to `ratio - 1` so that on the first clock edge after reset it is 0

    Parameters
    ----------
    clk, clkdiv : str
        Names of the clock domains: `clkdiv` samples the input, `clk` registers the output.
    i_dw, o_dw : int
        Input and output widths; `o_dw` must be a larger multiple of `i_dw`.
    i, o, reset : optional
        Signals to use; fresh ones are created when omitted.
    reset_value : int
        Reset value of the counter; negative values count back from `ratio`.
    name : str, optional
        When given, the counter signal is named `<name>_cnt`.
    """
    LATENCY = 2

    def __init__(self, clk, clkdiv, i_dw, o_dw, i=None, o=None, reset=None, reset_value=-1, name=None):
        assert i_dw < o_dw
        assert o_dw % i_dw == 0
        ratio = o_dw // i_dw

        sd_clk    = getattr(self.sync, clk)
        sd_clkdiv = getattr(self.sync, clkdiv)

        if i is None: i = Signal(i_dw)
        if o is None: o = Signal(o_dw)
        if reset is None: reset = Signal()

        self.i     = i
        self.o     = o
        self.reset = reset

        if reset_value < 0:
            reset_value = ratio + reset_value

        cnt = Signal(max=ratio, reset=reset_value, name='{}_cnt'.format(name) if name is not None else None)
        # Wrap explicitly at `ratio - 1` (as Serializer does) instead of relying on the
        # natural overflow of `cnt`, which is only correct for power-of-two ratios.
        sd_clkdiv += If(reset | (cnt == ratio - 1), cnt.eq(0)).Else(cnt.eq(cnt + 1))

        o_pre = Signal.like(self.o)
        o_array = Array([o_pre[n*i_dw:(n+1)*i_dw] for n in range(ratio)])
        sd_clkdiv += o_array[cnt].eq(self.i)
        # we need to ensure that the last bit will be correct if clocks are phase aligned
        o_pre_d = Signal.like(self.o)
        sd_clk += o_pre_d.eq(o_pre)
        sd_clk += self.o.eq(Cat(o_pre_d[:-1], o_pre[-1]))  # would work as self.comb (at least in simulation)
def chunks(lst, n):
    """Yield consecutive slices of `lst`, each holding at most `n` items."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))


def bitpattern(s):
    """Convert a waveform string into an integer, first character being the LSB.

    `_` stands for 0 and `-` for 1. Strings longer than 8 characters are handled
    in groups of 8, where the i-th group ends up in bits [8*i+7:8*i].
    """
    if len(s) > 8:
        value = 0
        for index, piece in enumerate(chunks(s, 8)):
            value |= bitpattern(piece) << (8*index)
        return value
    assert len(s) == 8
    bits = s.translate(s.maketrans("_-", "01"))
    # LSB first, so reverse the string
    return int(bits[::-1], 2)


def delayed(mod, sig, cycles=1):
    """Add a `TappedDelayLine` with `cycles` taps to `mod` and return its output."""
    line = TappedDelayLine(signal=sig, ntaps=cycles)
    mod.submodules += line
    return line.output
class ConstBitSlip(Module):
    """Bitslip with a slip value fixed at build time.

    Input words are shifted into the register `r`, which holds `cycles + 1` words. The
    output is a `dw`-wide window of `r` whose position is selected by `slp`
    (`0 <= slp <= cycles*dw - 1`).
    """

    def __init__(self, dw, i=None, o=None, slp=None, cycles=1):
        self.i = Signal(dw, name='i') if i is None else i
        self.o = Signal(dw, name='o') if o is None else o
        assert cycles >= 1
        assert 0 <= slp <= cycles*dw-1
        slp = (cycles*dw-1) - slp

        # # #

        self.r = r = Signal((cycles+1)*dw, reset_less=True)
        # Drop the oldest word and append the new one at the top.
        self.sync += r.eq(Cat(r[dw:], self.i))
        windows = {
            offset: self.o.eq(r[offset+1:dw+offset+1])
            for offset in range(cycles*dw)
        }
        self.comb += Case(slp, windows)


# TODO: rewrite DQSPattern in litedram/common.py to support different data widths
class DQSPattern(Module):
    """Generate the 16-bit DQS pattern (transmitted LSB-first) for one sys clock cycle.

    The regular toggling pattern is replaced by the preamble or postamble variant when
    the respective signal is asserted. With `wlevel_en` set the output is all zeros,
    except for a single pulse while `wlevel_strobe` is asserted. When `register` is true,
    the output is registered.
    """

    def __init__(self, preamble=None, postamble=None, wlevel_en=0, wlevel_strobe=0, register=False):
        self.preamble  = Signal() if preamble  is None else preamble
        self.postamble = Signal() if postamble is None else postamble
        self.o = Signal(16)

        # # #

        # DQS Pattern transmitted as LSB-first.
        toggling     = 0b0101010101010101
        with_pre     = 0b0001010101010101
        with_post    = 0b0101010101010100
        level_idle   = 0b0000000000000000
        level_strobe = 0b0000000000000001

        # Later statements take priority over earlier ones.
        self.comb += [
            self.o.eq(toggling),
            If(self.preamble, self.o.eq(with_pre)),
            If(self.postamble, self.o.eq(with_post)),
            If(wlevel_en,
                self.o.eq(level_idle),
                If(wlevel_strobe, self.o.eq(level_strobe)),
            ),
        ]
        if register:
            o = Signal.like(self.o)
            self.sync += o.eq(self.o)
            self.o = o
TappedDelayLine(signal=sig, ntaps=cycles) - mod.submodules += delay - return delay.output - -class ConstBitSlip(Module): - def __init__(self, dw, i=None, o=None, slp=None, cycles=1): - self.i = Signal(dw, name='i') if i is None else i - self.o = Signal(dw, name='o') if o is None else o - assert cycles >= 1 - assert 0 <= slp <= cycles*dw-1 - slp = (cycles*dw-1) - slp - - # # # - - self.r = r = Signal((cycles+1)*dw, reset_less=True) - self.sync += r.eq(Cat(r[dw:], self.i)) - cases = {} - for i in range(cycles*dw): - cases[i] = self.o.eq(r[i+1:dw+i+1]) - self.comb += Case(slp, cases) - -# TODO: rewrite DQSPattern in common.py to support different data widths -class DQSPattern(Module): - def __init__(self, preamble=None, postamble=None, wlevel_en=0, wlevel_strobe=0, register=False): - self.preamble = Signal() if preamble is None else preamble - self.postamble = Signal() if postamble is None else postamble - self.o = Signal(16) - - # # # - - # DQS Pattern transmitted as LSB-first. - - self.comb += [ - self.o.eq(0b0101010101010101), - If(self.preamble, - self.o.eq(0b0001010101010101) - ), - If(self.postamble, - self.o.eq(0b0101010101010100) - ), - If(wlevel_en, - self.o.eq(0b0000000000000000), - If(wlevel_strobe, - self.o.eq(0b0000000000000001) - ) - ) - ] - if register: - o = Signal.like(self.o) - self.sync += o.eq(self.o) - self.o = o - -# LPDDR4PHY ---------------------------------------------------------------------------------------- - -class LPDDR4PHY(Module, AutoCSR): - def __init__(self, pads, *, - sys_clk_freq, write_ser_latency, read_des_latency, phytype, cmd_delay=None): - self.pads = pads - self.memtype = memtype = "LPDDR4" - self.nranks = nranks = 1 if not hasattr(pads, "cs_n") else len(pads.cs_n) - self.databits = databits = len(pads.dq) - self.addressbits = addressbits = 17 # for activate row address - self.bankbits = bankbits = 3 - self.nphases = nphases = 8 - self.tck = tck = 1 / (nphases*sys_clk_freq) - assert databits % 8 == 0 - - # Parameters 
------------------------------------------------------------------------------- - def get_cl_cw(memtype, tck): - # MT53E256M16D1, No DBI, Set A - f_to_cl_cwl = OrderedDict() - f_to_cl_cwl[ 532e6] = ( 6, 4) # FIXME: with that low cwl, wrtap is 0 - f_to_cl_cwl[1066e6] = (10, 6) - f_to_cl_cwl[1600e6] = (14, 8) - f_to_cl_cwl[2132e6] = (20, 10) - f_to_cl_cwl[2666e6] = (24, 12) - f_to_cl_cwl[3200e6] = (28, 14) - f_to_cl_cwl[3732e6] = (32, 16) - f_to_cl_cwl[4266e6] = (36, 18) - for f, (cl, cwl) in f_to_cl_cwl.items(): - if tck >= 2/f: - return cl, cwl - raise ValueError - - # Bitslip introduces latency between from `cycles` up to `cycles + 1` - bitslip_cycles = 1 - # Commands are sent over 4 cycles of DRAM clock (sys8x) - cmd_latency = 4 - # Commands read from adapters are delayed on ConstBitSlips - ca_latency = 1 - - cl, cwl = get_cl_cw(memtype, tck) - cl_sys_latency = get_sys_latency(nphases, cl) - cwl_sys_latency = get_sys_latency(nphases, cwl) - rdphase = get_sys_phase(nphases, cl_sys_latency, cl + cmd_latency) - wrphase = get_sys_phase(nphases, cwl_sys_latency, cwl + cmd_latency) - - # When the calculated phase is negative, it means that we need to increase sys latency - def updated_latency(phase): - delay_update = 0 - while phase < 0: - phase += nphases - delay_update += 1 - return phase, delay_update - - wrphase, cwl_sys_delay = updated_latency(wrphase) - rdphase, cl_sys_delay = updated_latency(rdphase) - cwl_sys_latency += cwl_sys_delay - cl_sys_latency += cl_sys_delay - - # Read latency - read_data_delay = ca_latency + write_ser_latency + cl_sys_latency # DFI cmd -> read data on DQ - read_des_delay = read_des_latency + bitslip_cycles # data on DQ -> data on DFI rddata - read_latency = read_data_delay + read_des_delay - - # Write latency - write_latency = cwl_sys_latency - - # FIXME: remove - if __import__("os").environ.get("DEBUG") == '1': - print('cl', end=' = '); __import__('pprint').pprint(cl) - print('cwl', end=' = '); __import__('pprint').pprint(cwl) - 
print('cl_sys_latency', end=' = '); __import__('pprint').pprint(cl_sys_latency) - print('cwl_sys_latency', end=' = '); __import__('pprint').pprint(cwl_sys_latency) - print('rdphase', end=' = '); __import__('pprint').pprint(rdphase) - print('wrphase', end=' = '); __import__('pprint').pprint(wrphase) - print('read_data_delay', end=' = '); __import__('pprint').pprint(read_data_delay) - print('read_des_delay', end=' = '); __import__('pprint').pprint(read_des_delay) - print('read_latency', end=' = '); __import__('pprint').pprint(read_latency) - print('write_latency', end=' = '); __import__('pprint').pprint(write_latency) - - # Registers -------------------------------------------------------------------------------- - self._rst = CSRStorage() - - self._dly_sel = CSRStorage(databits//8) - - self._wlevel_en = CSRStorage() - self._wlevel_strobe = CSR() - - self._dly_sel = CSRStorage(databits//8) - - self._rdly_dq_bitslip_rst = CSR() - self._rdly_dq_bitslip = CSR() - - self._wdly_dq_bitslip_rst = CSR() - self._wdly_dq_bitslip = CSR() - - self._rdphase = CSRStorage(int(math.log2(nphases)), reset=rdphase) - self._wrphase = CSRStorage(int(math.log2(nphases)), reset=wrphase) - - # PHY settings ----------------------------------------------------------------------------- - self.settings = PhySettings( - phytype = phytype, - memtype = memtype, - databits = databits, - dfi_databits = 2*databits, - nranks = nranks, - nphases = nphases, - rdphase = self._rdphase.storage, - wrphase = self._wrphase.storage, - cl = cl, - cwl = cwl, - read_latency = read_latency, - write_latency = write_latency, - cmd_latency = cmd_latency, - cmd_delay = cmd_delay, - ) - - # DFI Interface ---------------------------------------------------------------------------- - # Due to the fact that LPDDR4 has 16n prefetch we use 8 phases to be able to read/write a - # whole burst during a single controller clock cycle. PHY should use sys8x clock. 
- self.dfi = dfi = Interface(addressbits, bankbits, nranks, 2*databits, nphases=8) - - # # # - - adapters = [DFIPhaseAdapter(phase) for phase in self.dfi.phases] - self.submodules += adapters - - # Now prepare the data by converting the sequences on adapters into sequences on the pads. - # We have to ignore overlapping commands, and module timings have to ensure that there are - # no overlapping commands anyway. - # Pads: reset_n, CS, CKE, CK, CA[5:0], DMI[1:0], DQ[15:0], DQS[1:0], ODT_CA - self.ck_clk = Signal(2*nphases) - self.ck_cke = Signal(nphases) - self.ck_odt = Signal(nphases) - self.ck_reset_n = Signal(nphases) - self.ck_cs = Signal(nphases) - self.ck_ca = [Signal(nphases) for _ in range(6)] - self.ck_dmi_o = [Signal(2*nphases) for _ in range(2)] - self.ck_dmi_i = [Signal(2*nphases) for _ in range(2)] - self.dmi_oe = Signal() - self.ck_dq_o = [Signal(2*nphases) for _ in range(databits)] - self.ck_dq_i = [Signal(2*nphases) for _ in range(databits)] - self.dq_oe = Signal() - self.ck_dqs_o = [Signal(2*nphases) for _ in range(2)] - self.ck_dqs_i = [Signal(2*nphases) for _ in range(2)] - self.dqs_oe = Signal() - - # Clocks ----------------------------------------------------------------------------------- - self.comb += self.ck_clk.eq(bitpattern("-_-_-_-_" * 2)) - - # Simple commands -------------------------------------------------------------------------- - self.comb += [ - self.ck_cke.eq(Cat(delayed(self, phase.cke) for phase in self.dfi.phases)), - self.ck_odt.eq(Cat(delayed(self, phase.odt) for phase in self.dfi.phases)), - self.ck_reset_n.eq(Cat(delayed(self, phase.reset_n) for phase in self.dfi.phases)), - ] - - # LPDDR4 Commands -------------------------------------------------------------------------- - # Each command can span several phases (up to 4), so we must ignore overlapping commands, - # but in general, module timings should be set in a way that overlapping will never happen. 
- - # Create a history of valid adapters used for masking overlapping ones. - # TODO: make optional, as it takes up resources and the controller should ensure no overlaps - valids = ConstBitSlip(dw=nphases, cycles=1, slp=0) - self.submodules += valids - self.comb += valids.i.eq(Cat(a.valid for a in adapters)) - # valids_hist = valids.r - valids_hist = Signal.like(valids.r) - # TODO: especially make this part optional - for i in range(len(valids_hist)): - was_valid_before = reduce(or_, valids_hist[max(0, i-3):i], 0) - self.comb += valids_hist[i].eq(valids.r[i] & ~was_valid_before) - - cs_per_adapter = [] - ca_per_adapter = defaultdict(list) - for phase, adapter in enumerate(adapters): - # The signals from an adapter can be used if there were no commands on 3 previous cycles - allowed = ~reduce(or_, valids_hist[nphases+phase - 3:nphases+phase]) - - # Use CS and CA of given adapter slipped by `phase` bits - cs_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase) - self.submodules += cs_bs - self.comb += cs_bs.i.eq(Cat(adapter.cs)), - cs_mask = Replicate(allowed, len(cs_bs.o)) - cs = cs_bs.o & cs_mask - cs_per_adapter.append(cs) - - # For CA we need to do the same for each bit - ca_bits = [] - for bit in range(6): - ca_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase) - self.submodules += ca_bs - ca_bit_hist = [adapter.ca[i][bit] for i in range(4)] - self.comb += ca_bs.i.eq(Cat(*ca_bit_hist)), - ca_mask = Replicate(allowed, len(ca_bs.o)) - ca = ca_bs.o & ca_mask - ca_per_adapter[bit].append(ca) - - # OR all the masked signals - self.comb += self.ck_cs.eq(reduce(or_, cs_per_adapter)) - for bit in range(6): - self.comb += self.ck_ca[bit].eq(reduce(or_, ca_per_adapter[bit])) - - # DQ --------------------------------------------------------------------------------------- - dq_oe = Signal() - self.comb += self.dq_oe.eq(delayed(self, dq_oe, cycles=1)) - - for bit in range(self.databits): - # output - self.submodules += BitSlip( - dw = 2*nphases, - cycles = bitslip_cycles, - 
rst = (self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage, - slp = self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip.re, - i = Cat(*[self.dfi.phases[i//2].wrdata[i%2 * self.databits + bit] for i in range(2*nphases)]), - o = self.ck_dq_o[bit], - ) - - # input - dq_i_bs = Signal(2*nphases) - self.submodules += BitSlip( - dw = 2*nphases, - cycles = bitslip_cycles, - rst = (self._dly_sel.storage[bit//8] & self._rdly_dq_bitslip_rst.re) | self._rst.storage, - slp = self._dly_sel.storage[bit//8] & self._rdly_dq_bitslip.re, - i = self.ck_dq_i[bit], - o = dq_i_bs, - ) - for i in range(2*nphases): - self.comb += self.dfi.phases[i//2].rddata[i%2 * self.databits + bit].eq(dq_i_bs[i]) - - # DQS -------------------------------------------------------------------------------------- - dqs_oe = Signal() - dqs_preamble = Signal() - dqs_postamble = Signal() - dqs_pattern = DQSPattern( - preamble = dqs_preamble, # FIXME: are defined the opposite way (common.py) ??? - postamble = dqs_postamble, - wlevel_en = self._wlevel_en.storage, - wlevel_strobe = self._wlevel_strobe.re) - self.submodules += dqs_pattern - self.comb += [ - self.dqs_oe.eq(delayed(self, dqs_oe, cycles=1)), - ] - - for bit in range(self.databits//8): - # output - self.submodules += BitSlip( - dw = 2*nphases, - cycles = bitslip_cycles, - rst = (self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage, - slp = self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip.re, - i = dqs_pattern.o, - o = self.ck_dqs_o[bit], - ) - - # DMI -------------------------------------------------------------------------------------- - # DMI signal is used for Data Mask or Data Bus Invertion depending on Mode Registers values. - # With DM and DBI disabled, this signal is a Don't Care. - # With DM enabled, masking is performed only when the command used is WRITE-MASKED. 
- # TODO: use WRITE-MASKED for all write commands, and configure Mode Registers for that - # during DRAM initialization (we don't want to support DBI). - for bin in range(self.databits//8): - self.comb += self.ck_dmi_o[bit].eq(0) - - # Read Control Path ------------------------------------------------------------------------ - # Creates a delay line of read commands coming from the DFI interface. The output is used to - # signal a valid read data to the DFI interface. - # - # The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI - # interface, the latency is the sum of the OSERDESE2, CAS, ISERDESE2 and Bitslip latencies. - rddata_en = TappedDelayLine( - signal = reduce(or_, [dfi.phases[i].rddata_en for i in range(nphases)]), - ntaps = self.settings.read_latency - ) - self.submodules += rddata_en - - self.comb += [phase.rddata_valid.eq(rddata_en.output | self._wlevel_en.storage) for phase in dfi.phases] - - # Write Control Path ----------------------------------------------------------------------- - wrtap = cwl_sys_latency - 1 - assert wrtap >= 1 - - # Create a delay line of write commands coming from the DFI interface. This taps are used to - # control DQ/DQS tristates. - wrdata_en = TappedDelayLine( - signal = reduce(or_, [dfi.phases[i].wrdata_en for i in range(nphases)]), - ntaps = wrtap + 2 - ) - self.submodules += wrdata_en - - self.comb += dq_oe.eq(wrdata_en.taps[wrtap]) - self.comb += If(self._wlevel_en.storage, dqs_oe.eq(1)).Else(dqs_oe.eq(dqs_preamble | dq_oe | dqs_postamble)) - - # Write DQS Postamble/Preamble Control Path ------------------------------------------------ - # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last - # write. During writes, DQS tristate is configured as output for at least 3 sys_clk cycles: - # 1 for Preamble, 1 for the Write and 1 for the Postamble. 
- self.comb += dqs_preamble.eq( wrdata_en.taps[wrtap - 1] & ~wrdata_en.taps[wrtap + 0]) - self.comb += dqs_postamble.eq(wrdata_en.taps[wrtap + 1] & ~wrdata_en.taps[wrtap + 0]) - -class DFIPhaseAdapter(Module): - # We must perform mapping of DFI commands to the LPDDR4 commands set on CA bus. - # LPDDR4 "small command" consists of 2 words CA[5:0] sent on the bus in 2 subsequent - # cycles. First cycle is marked with CS high, second with CS low. - # Then most "big commands" consist of 2 "small commands" (e.g. ACTIVATE-1, ACTIVATE-2). - # If a command uses 1 "small command", then it shall go as cmd2 so that all command - # timings can be counted from the same moment (cycle of cmd2 CS low). - - # MPC (multipurpose command) can be used to perform different actions - # We use ZQC with BA=0 to issue MPC, where OP[6:0] = A[6:0] - MPC = { - "NOP": 0b0000000, # only OP[6] must be 0 - "READ-FIFO": 0b1000001, - "READ-DQ-CAL": 0b1000011, - # RFU: 0b1000101 - "WRITE-FIFO": 0b1000111, - # RFU: 0b1001001 - "START-DQS-OSC": 0b1001011, - "STOP-DQS-OSC": 0b1001101, - "ZQC-START": 0b1001111, - "ZQC-LATCH": 0b1010001, - } - - def __init__(self, dfi_phase): - # CS/CA values for 4 SDR cycles - self.cs = Signal(4) - self.ca = Array([Signal(6) for _ in range(4)]) - self.valid = Signal() - - # # # - - self.submodules.cmd1 = Command(dfi_phase) - self.submodules.cmd2 = Command(dfi_phase) - self.comb += [ - self.cs[:2].eq(self.cmd1.cs), - self.cs[2:].eq(self.cmd2.cs), - self.ca[0].eq(self.cmd1.ca[0]), - self.ca[1].eq(self.cmd1.ca[1]), - self.ca[2].eq(self.cmd2.ca[0]), - self.ca[3].eq(self.cmd2.ca[1]), - ] - - dfi_cmd = Signal(3) - self.comb += dfi_cmd.eq(Cat(~dfi_phase.we_n, ~dfi_phase.ras_n, ~dfi_phase.cas_n)), - _cmd = { # cas, ras, we - "NOP": 0b000, - "ACT": 0b010, - "RD": 0b100, - "WR": 0b101, - "PRE": 0b011, - "REF": 0b110, - "ZQC": 0b001, - "MRS": 0b111, - } - - def cmds(cmd1, cmd2, valid=1): - return self.cmd1.set(cmd1) + self.cmd2.set(cmd2) + [self.valid.eq(valid)] - - self.comb += 
If(dfi_phase.cs_n == 0, # require dfi.cs_n - Case(dfi_cmd, { - _cmd["ACT"]: cmds("ACTIVATE-1", "ACTIVATE-2"), - _cmd["RD"]: cmds("READ-1", "CAS-2"), - _cmd["WR"]: cmds("WRITE-1", "CAS-2"), # TODO: masked write - _cmd["PRE"]: cmds("DESELECT", "PRECHARGE"), - _cmd["REF"]: cmds("DESELECT", "REFRESH"), - _cmd["ZQC"]: cmds("DESELECT", "MPC"), - _cmd["MRS"]: cmds("MRW-1", "MRW-2"), - "default": cmds("DESELECT", "DESELECT", valid=0), - }) - ) - -class Command(Module): - # String description of 1st and 2nd edge of each command, later parsed to construct - # the value. CS is assumed to be H for 1st edge and L for 2nd edge. - TRUTH_TABLE = { - "MRW-1": ["L H H L L OP7", "MA0 MA1 MA2 MA3 MA4 MA5"], - "MRW-2": ["L H H L H OP6", "OP0 OP1 OP2 OP3 OP4 OP5"], - "MRR-1": ["L H H H L V", "MA0 MA1 MA2 MA3 MA4 MA5"], - "REFRESH": ["L L L H L AB", "BA0 BA1 BA2 V V V"], - "ACTIVATE-1": ["H L R12 R13 R14 R15", "BA0 BA1 BA2 R16 R10 R11"], - "ACTIVATE-2": ["H H R6 R7 R8 R9", "R0 R1 R2 R3 R4 R5"], - "WRITE-1": ["L L H L L BL", "BA0 BA1 BA2 V C9 AP"], - "MASK WRITE-1": ["L L H H L BL", "BA0 BA1 BA2 V C9 AP"], - "READ-1": ["L H L L L BL", "BA0 BA1 BA2 V C9 AP"], - "CAS-2": ["L H L L H C8", "C2 C3 C4 C5 C6 C7"], - "PRECHARGE": ["L L L L H AB", "BA0 BA1 BA2 V V V"], - "MPC": ["L L L L L OP6", "OP0 OP1 OP2 OP3 OP4 OP5"], - "DESELECT": ["X X X X X X", "X X X X X X"], - } - - for cmd, (subcmd1, subcmd2) in TRUTH_TABLE.items(): - assert len(subcmd1.split()) == 6, (cmd, subcmd1) - assert len(subcmd2.split()) == 6, (cmd, subcmd2) - - def __init__(self, dfi_phase): - self.cs = Signal(2) - self.ca = Array([Signal(6), Signal(6)]) # CS high, CS low - self.dfi = dfi_phase - - def set(self, cmd): - ops = [] - for i, description in enumerate(self.TRUTH_TABLE[cmd]): - for j, bit in enumerate(description.split()): - ops.append(self.ca[i][j].eq(self.parse_bit(bit, is_mpc=cmd == "MPC"))) - if cmd != "DESELECT": - ops.append(self.cs[0].eq(1)) - return ops - - def parse_bit(self, bit, is_mpc=False): - rules = { - 
"H": lambda: 1, # high - "L": lambda: 0, # low - "V": lambda: 0, # defined logic - "X": lambda: 0, # don't care - "BL": lambda: 0, # on-the-fly burst length, not using - "AP": lambda: self.dfi.address[10], # auto precharge - "AB": lambda: self.dfi.address[10], # all banks - "BA(\d+)": lambda i: self.dfi.bank[i], - "R(\d+)": lambda i: self.dfi.address[i], # row - "C(\d+)": lambda i: self.dfi.address[i], # column - "MA(\d+)": lambda i: self.dfi.address[8+i], # mode register address - "OP(\d+)": lambda i: self.dfi.address[i], # mode register value, or operand for MPC - } - for pattern, value in rules.items(): - m = re.match(pattern, bit) - if m: - args = [int(g) for g in m.groups()] - return value(*args) - raise ValueError(bit) - -# SimulationPHY ------------------------------------------------------------------------------------ - -class LPDDR4SimulationPads(Module): - def __init__(self, databits=16): - self.clk_p = Signal() - self.clk_n = Signal() - self.cke = Signal() - self.odt = Signal() - self.reset_n = Signal() - self.cs = Signal() - self.ca = Signal(6) - # signals for checking actual tristate lines state (PHY reads these) - self.dq = Signal(databits) - self.dqs = Signal(databits//8) - self.dmi = Signal(databits//8) - # internal tristates i/o that should be driven for simulation - self.dq_o = Signal(databits) # PHY drives these - self.dq_i = Signal(databits) # DRAM chip (simulator) drives these - self.dq_oe = Signal() # PHY drives these - self.dqs_o = Signal(databits//8) - self.dqs_i = Signal(databits//8) - self.dqs_oe = Signal() - self.dmi_o = Signal(databits//8) - self.dmi_i = Signal(databits//8) - self.dmi_oe = Signal() - - self.comb += [ - If(self.dq_oe, self.dq.eq(self.dq_o)).Else(self.dq.eq(self.dq_i)), - If(self.dqs_oe, self.dqs.eq(self.dqs_o)).Else(self.dqs.eq(self.dqs_i)), - If(self.dmi_oe, self.dmi.eq(self.dmi_o)).Else(self.dmi.eq(self.dmi_i)), - ] - - -class SimulationPHY(LPDDR4PHY): - def __init__(self, sys_clk_freq=100e6, aligned_reset_zero=False): 
- pads = LPDDR4SimulationPads() - self.submodules += pads - super().__init__(pads, - sys_clk_freq = sys_clk_freq, - write_ser_latency = Serializer.LATENCY, - read_des_latency = Deserializer.LATENCY, - phytype = "SimulationPHY") - - def add_reset_value(phase, kwargs): - if aligned_reset_zero and phase == 0: - kwargs["reset_value"] = 0 - - # Serialization - def serialize(**kwargs): - name = 'ser_' + kwargs.pop('name', '') - ser = Serializer(o_dw=1, name=name.strip('_'), **kwargs) - self.submodules += ser - - def deserialize(**kwargs): - name = 'des_' + kwargs.pop('name', '') - des = Deserializer(i_dw=1, name=name.strip('_'), **kwargs) - self.submodules += des - - def ser_sdr(phase=0, **kwargs): - clkdiv = {0: "sys8x", 90: "sys8x_90"}[phase] - # clk = {0: "sys", 90: "sys_11_25"}[phase] - clk = {0: "sys", 90: "sys"}[phase] - add_reset_value(phase, kwargs) - serialize(clk=clk, clkdiv=clkdiv, i_dw=8, **kwargs) - - def ser_ddr(phase=0, **kwargs): - # for simulation we require sys8x_ddr clock (=sys16x) - clkdiv = {0: "sys8x_ddr", 90: "sys8x_90_ddr"}[phase] - # clk = {0: "sys", 90: "sys_11_25"}[phase] - clk = {0: "sys", 90: "sys"}[phase] - add_reset_value(phase, kwargs) - serialize(clk=clk, clkdiv=clkdiv, i_dw=16, **kwargs) - - def des_ddr(phase=0, **kwargs): - clkdiv = {0: "sys8x_ddr", 90: "sys8x_90_ddr"}[phase] - clk = {0: "sys", 90: "sys_11_25"}[phase] - add_reset_value(phase, kwargs) - deserialize(clk=clk, clkdiv=clkdiv, o_dw=16, **kwargs) - - # Clock is shifted 180 degrees to get rising edge in the middle of SDR signals. - # To achieve that we send negated clock on clk_p and non-negated on clk_n. 
- ser_ddr(i=~self.ck_clk, o=self.pads.clk_p, name='clk_p') - ser_ddr(i=self.ck_clk, o=self.pads.clk_n, name='clk_n') - - ser_sdr(i=self.ck_cke, o=self.pads.cke, name='cke') - ser_sdr(i=self.ck_odt, o=self.pads.odt, name='odt') - ser_sdr(i=self.ck_reset_n, o=self.pads.reset_n, name='reset_n') - - # Command/address - ser_sdr(i=self.ck_cs, o=self.pads.cs, name='cs') - for i in range(6): - ser_sdr(i=self.ck_ca[i], o=self.pads.ca[i], name=f'ca{i}') - - # Tristate I/O (separate for simulation) - for i in range(self.databits//8): - ser_ddr(i=self.ck_dmi_o[i], o=self.pads.dmi_o[i], name=f'dmi_o{i}') - des_ddr(o=self.ck_dmi_i[i], i=self.pads.dmi[i], name=f'dmi_i{i}') - ser_ddr(i=self.ck_dqs_o[i], o=self.pads.dqs_o[i], name=f'dqs_o{i}', phase=90) - des_ddr(o=self.ck_dqs_i[i], i=self.pads.dqs[i], name=f'dqs_i{i}', phase=90) - for i in range(self.databits): - ser_ddr(i=self.ck_dq_o[i], o=self.pads.dq_o[i], name=f'dq_o{i}') - des_ddr(o=self.ck_dq_i[i], i=self.pads.dq[i], name=f'dq_i{i}') - # Output enable signals - self.comb += self.pads.dmi_oe.eq(delayed(self, self.dmi_oe, cycles=Serializer.LATENCY)) - self.comb += self.pads.dqs_oe.eq(delayed(self, self.dqs_oe, cycles=Serializer.LATENCY)) - self.comb += self.pads.dq_oe.eq(delayed(self, self.dq_oe, cycles=Serializer.LATENCY)) - -class Serializer(Module): - """Serialize given input signal - - It latches the input data on the rising edge of `clk`. Output data counter `cnt` is incremented - on rising edges of `clkdiv` and it determines current slice of `i` that is presented on `o`. - `latency` is specified in `clk` cycles. - - NOTE: both `clk` and `clkdiv` should be phase aligned. 
- NOTE: `reset_value` is set to `ratio - 1` so that on the first clock edge after reset it is 0 - """ - LATENCY = 1 - - def __init__(self, clk, clkdiv, i_dw, o_dw, i=None, o=None, reset=None, reset_value=-1, name=None): - assert i_dw > o_dw - assert i_dw % o_dw == 0 - ratio = i_dw // o_dw - - sd_clk = getattr(self.sync, clk) - sd_clkdiv = getattr(self.sync, clkdiv) - - if i is None: i = Signal(i_dw) - if o is None: o = Signal(o_dw) - if reset is None: reset = Signal() - - self.i = i - self.o = o - self.reset = reset - - if reset_value < 0: - reset_value = ratio + reset_value - - cnt = Signal(max=ratio, reset=reset_value, name='{}_cnt'.format(name) if name is not None else None) - sd_clkdiv += If(reset | cnt == ratio - 1, cnt.eq(0)).Else(cnt.eq(cnt + 1)) - - i_d = Signal.like(self.i) - sd_clk += i_d.eq(self.i) - i_array = Array([i_d[n*o_dw:(n+1)*o_dw] for n in range(ratio)]) - self.comb += self.o.eq(i_array[cnt]) - -class Deserializer(Module): - """Deserialize given input signal - - Latches the input data on the rising edges of `clkdiv` and stores them in the `o_pre` buffer. - Additional latency cycle is used to ensure that the last input bit is deserialized correctly. - - NOTE: both `clk` and `clkdiv` should be phase aligned. 
- NOTE: `reset_value` is set to `ratio - 1` so that on the first clock edge after reset it is 0 - """ - LATENCY = 2 - - def __init__(self, clk, clkdiv, i_dw, o_dw, i=None, o=None, reset=None, reset_value=-1, name=None): - assert i_dw < o_dw - assert o_dw % i_dw == 0 - ratio = o_dw // i_dw - - sd_clk = getattr(self.sync, clk) - sd_clkdiv = getattr(self.sync, clkdiv) - - if i is None: i = Signal(i_dw) - if o is None: o = Signal(o_dw) - if reset is None: reset = Signal() - - self.i = i - self.o = o - self.reset = reset - - if reset_value < 0: - reset_value = ratio + reset_value - - cnt = Signal(max=ratio, reset=reset_value, name='{}_cnt'.format(name) if name is not None else None) - sd_clkdiv += If(reset, cnt.eq(0)).Else(cnt.eq(cnt + 1)) - - o_pre = Signal.like(self.o) - o_array = Array([o_pre[n*i_dw:(n+1)*i_dw] for n in range(ratio)]) - sd_clkdiv += o_array[cnt].eq(self.i) - # we need to ensure that the last bit will be correct if clocks are phase aligned - o_pre_d = Signal.like(self.o) - sd_clk += o_pre_d.eq(o_pre) - sd_clk += self.o.eq(Cat(o_pre_d[:-1], o_pre[-1])) # would work as self.comb (at least in simulation) diff --git a/test/test_lpddr4.py b/test/test_lpddr4.py index 2cf64db..a8aa348 100644 --- a/test/test_lpddr4.py +++ b/test/test_lpddr4.py @@ -10,7 +10,7 @@ from typing import Mapping, Sequence from migen import * from litedram.phy import dfi -from litedram.phy.lpddr4phy import SimulationPHY, Serializer, Deserializer +from litedram.phy.lpddr4.simphy import LPDDR4SimPHY, Serializer, Deserializer from litex.gen.sim import run_simulation as _run_simulation @@ -422,7 +422,7 @@ class TestLPDDR4(unittest.TestCase): def test_lpddr4_cs_phase_0(self): # Test that CS is serialized correctly when sending command on phase 0 latency = '00000000' * self.CMD_LATENCY - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = [ {0: dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)}, # p0: READ ], @@ -434,7 +434,7 @@ class TestLPDDR4(unittest.TestCase): def 
test_lpddr4_clk(self): # Test clock serialization, first few cycles are undefined so ignore them latency = 'xxxxxxxx' * self.CMD_LATENCY - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = [ {3: dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)}, ], @@ -446,7 +446,7 @@ class TestLPDDR4(unittest.TestCase): def test_lpddr4_cs_multiple_phases(self): # Test that CS is serialized on different phases and that overlapping commands are handled latency = '00000000' * self.CMD_LATENCY - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = [ {0: dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)}, {3: dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)}, @@ -479,7 +479,7 @@ class TestLPDDR4(unittest.TestCase): # Test proper serialization of commands to CA pads and that overlapping commands are handled latency = '00000000' * self.CMD_LATENCY read = dict(cs_n=0, cas_n=0, ras_n=1, we_n=1) - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = [ {0: read, 3: read}, # p4 should be ignored {0: read, 4: read}, @@ -508,7 +508,7 @@ class TestLPDDR4(unittest.TestCase): mrw = dict(cs_n=0, cas_n=0, ras_n=0, we_n=0, bank=0, address=(0b110011 << 8) | 0b10101010) # 6-bit address | 8-bit op code zqc_start = dict(cs_n=0, cas_n=1, ras_n=1, we_n=0, bank=0, address=0b1001111) # MPC with ZQCAL START operand zqc_latch = dict(cs_n=0, cas_n=1, ras_n=1, we_n=0, bank=0, address=0b1010001) # MPC with ZQCAL LATCH operand - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = [ {0: read, 4: write_ap}, {0: activate, 4: refresh_ab}, @@ -532,7 +532,7 @@ class TestLPDDR4(unittest.TestCase): # Test serialization of DFI command pins (cs/cke/odt/reset_n) latency = '00000000' * self.CMD_LATENCY read = dict(cs_n=0, cas_n=0, ras_n=1, we_n=1) - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = [ { 0: dict(cke=1, odt=1, reset_n=1, **read), @@ -552,7 +552,7 @@ class TestLPDDR4(unittest.TestCase): def 
test_lpddr4_dq_out(self): # Test serialization of dfi wrdata to DQ pads - dut = SimulationPHY() + dut = LPDDR4SimPHY() zero = '00000000' * 2 # zero for 1 sysclk clock in sys8x_ddr clock domain dfi_data = { @@ -576,7 +576,7 @@ class TestLPDDR4(unittest.TestCase): def test_lpddr4_dq_only_1cycle(self): # Test that DQ data is sent to pads only during expected cycle, on other cycles there is no data - dut = SimulationPHY() + dut = LPDDR4SimPHY() zero = '00000000' * 2 dfi_data = { @@ -603,7 +603,7 @@ class TestLPDDR4(unittest.TestCase): # Test serialization of DQS pattern in relation to DQ data, with proper preamble and postamble zero = '00000000' * 2 - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = [ {0: dict(wrdata_en=1)}, {}, @@ -634,7 +634,7 @@ class TestLPDDR4(unittest.TestCase): # Test proper output on DMI pads. We don't implement masking now, so nothing should be sent to DMI pads zero = '00000000' * 2 - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = [ {0: dict(wrdata_en=1)}, {}, @@ -670,7 +670,7 @@ class TestLPDDR4(unittest.TestCase): {}, ] - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = dfi_sequence, pad_checkers = {}, pad_generators = {}, @@ -710,7 +710,7 @@ class TestLPDDR4(unittest.TestCase): {}, ] - self.run_test(SimulationPHY(), + self.run_test(LPDDR4SimPHY(), dfi_sequence = dfi_sequence, pad_checkers = {}, pad_generators = { @@ -720,7 +720,7 @@ class TestLPDDR4(unittest.TestCase): def test_lpddr4_cmd_write(self): # Test whole WRITE command sequence verifying data on pads and write_latency from MC perspective - phy = SimulationPHY() + phy = LPDDR4SimPHY() zero = '00000000' * 2 write_latency = phy.settings.write_latency wrphase = phy.settings.wrphase.reset.value @@ -770,7 +770,7 @@ class TestLPDDR4(unittest.TestCase): def test_lpddr4_cmd_read(self): # Test whole READ command sequence simulating DRAM response and verifying read_latency from MC perspective - phy = 
SimulationPHY() + phy = LPDDR4SimPHY() zero = '00000000' * 2 read_latency = phy.settings.read_latency rdphase = phy.settings.rdphase.reset.value