diff --git a/bench/genesys2.py b/bench/genesys2.py index 3d5cb75..7d9a6ac 100755 --- a/bench/genesys2.py +++ b/bench/genesys2.py @@ -97,6 +97,15 @@ class BenchSoC(SoCCore): self.add_csr("ethphy") self.add_etherbone(phy=self.ethphy) + # Analyzer --------------------------------------------------------------------------------- + from litescope import LiteScopeAnalyzer + analyzer_signals = [self.ddrphy.dfi] + self.submodules.analyzer = LiteScopeAnalyzer(analyzer_signals, + depth = 512, + clock_domain = "sys", + csr_csv = "analyzer.csv") + self.add_csr("analyzer") + # Leds ------------------------------------------------------------------------------------- from litex.soc.cores.led import LedChaser self.submodules.leds = LedChaser( diff --git a/litedram/common.py b/litedram/common.py index 69eb1a3..10d8eeb 100644 --- a/litedram/common.py +++ b/litedram/common.py @@ -117,23 +117,24 @@ class PHYPadsCombiner: # BitSlip ------------------------------------------------------------------------------------------ class BitSlip(Module): - def __init__(self, dw, rst=None, slp=None, cycles=1): - self.i = Signal(dw) - self.o = Signal(dw) - self.rst = Signal() if rst is None else rst - self.slp = Signal() if slp is None else slp + def __init__(self, dw, i=None, o=None, rst=None, slp=None, cycles=1): + self.i = Signal(dw) if i is None else i + self.o = Signal(dw) if o is None else o + self.rst = Signal() if rst is None else rst + self.slp = Signal() if slp is None else slp + assert cycles >= 1 # # # - value = Signal(max=cycles*dw) + value = Signal(max=cycles*dw, reset=cycles*dw-1) self.sync += If(self.slp, value.eq(value + 1)) - self.sync += If(self.rst, value.eq(0)) + self.sync += If(self.rst, value.eq(value.reset)) r = Signal((cycles+1)*dw, reset_less=True) self.sync += r.eq(Cat(r[dw:], self.i)) cases = {} for i in range(cycles*dw): - cases[i] = self.o.eq(r[i:dw+i]) + cases[i] = self.o.eq(r[i+1:dw+i+1]) self.comb += Case(value, cases) # TappedDelayLine ---------------------------------------------------------------------------------- diff --git a/litedram/init.py b/litedram/init.py index 169a286..7fa6cb8 100644 --- a/litedram/init.py +++ b/litedram/init.py @@ -507,7 +507,10 @@ def get_sdram_phy_c_header(phy_settings, timing_settings): r += "#define SDRAM_PHY_WRITE_LEVELING_CAPABLE\n" if phytype in ["USDDRPHY", "USPDDRPHY"]: r += "#define SDRAM_PHY_WRITE_LEVELING_REINIT\n" - if phytype in ["USDDRPHY", "USPDDRPHY", "A7DDRPHY", "K7DDRPHY", "V7DDRPHY", "ECP5DDRPHY"]: + if phytype in ["USDDRPHY", "USPDDRPHY", "A7DDRPHY", "K7DDRPHY", "V7DDRPHY"]: + r += "#define SDRAM_PHY_WRITE_LATENCY_CALIBRATION_CAPABLE\n" + r += "#define SDRAM_PHY_READ_LEVELING_CAPABLE\n" + if phytype in ["ECP5DDRPHY"]: r += "#define SDRAM_PHY_READ_LEVELING_CAPABLE\n" # Define number of modules/delays/bitslips diff --git a/litedram/phy/ecp5ddrphy.py b/litedram/phy/ecp5ddrphy.py index 939e213..241ac0f 100644 --- a/litedram/phy/ecp5ddrphy.py +++ b/litedram/phy/ecp5ddrphy.py @@ -24,6 +24,30 @@ from litex.soc.interconnect.csr import * from litedram.common import * from litedram.phy.dfi import * +# BitSlip ------------------------------------------------------------------------------------------ + +# FIXME: Use BitSlip from litedram.common. + +class BitSlip(Module): + def __init__(self, dw, rst=None, slp=None, cycles=1): + self.i = Signal(dw) + self.o = Signal(dw) + self.rst = Signal() if rst is None else rst + self.slp = Signal() if slp is None else slp + + # # # + + value = Signal(max=cycles*dw) + self.sync += If(self.slp, value.eq(value + 1)) + self.sync += If(self.rst, value.eq(0)) + + r = Signal((cycles+1)*dw, reset_less=True) + self.sync += r.eq(Cat(r[dw:], self.i)) + cases = {} + for i in range(cycles*dw): + cases[i] = self.o.eq(r[i:dw+i]) + self.comb += Case(value, cases) + # Lattice ECP5 DDR PHY Initialization -------------------------------------------------------------- class ECP5DDRPHYInit(Module): @@ -108,6 +132,8 @@ class ECP5DDRPHY(Module, AutoCSR): cwl_sys_latency = get_sys_latency(nphases, cwl) # Registers -------------------------------------------------------------------------------- + self._rst = CSRStorage() + self._dly_sel = CSRStorage(databits//8) self._rdly_dq_rst = CSR() @@ -154,7 +180,7 @@ class ECP5DDRPHY(Module, AutoCSR): for i in range(len(pads.clk_p)): sd_clk_se = Signal() self.specials += Instance("ODDRX2F", - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), **{f"i_D{n}": (0b1010 >> n) & 0b1 for n in range(4)}, @@ -177,7 +203,7 @@ class ECP5DDRPHY(Module, AutoCSR): pad = getattr(pads, pad_name) for i in range(len(pad)): self.specials += Instance("ODDRX2F", - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), **{f"i_D{n}": getattr(dfi.phases[n//2], dfi_name)[i] for n in range(4)}, @@ -210,7 +236,7 @@ class ECP5DDRPHY(Module, AutoCSR): p_DQS_LO_DEL_ADJ = "MINUS", p_DQS_LO_DEL_VAL = 4, # Clocks / Reset - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), i_DDRDEL = self.init.delay, @@ -252,7 +278,7 @@ class ECP5DDRPHY(Module, AutoCSR): dqs_oe_n = Signal() self.specials += [ Instance("ODDRX2DQSB", - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), i_DQSW = dqsw, @@ -260,7 +286,7 @@ class ECP5DDRPHY(Module, AutoCSR): o_Q = dqs ), Instance("TSHX2DQSA", - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), i_DQSW = dqsw, @@ -283,7 +309,7 @@ class ECP5DDRPHY(Module, AutoCSR): dm_bl8_cases[1] = dm_o_data_muxed.eq(dm_o_data_d[4:]) self.sync += Case(bl8_chunk, dm_bl8_cases) self.specials += Instance("ODDRX2DQA", - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), i_DQSW270 = dqsw270, @@ -310,7 +336,7 @@ class ECP5DDRPHY(Module, AutoCSR): self.sync += Case(bl8_chunk, dq_bl8_cases) self.specials += [ Instance("ODDRX2DQA", - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), i_DQSW270 = dqsw270, @@ -319,7 +345,7 @@ class ECP5DDRPHY(Module, AutoCSR): ) ] dq_i_bitslip = BitSlip(4, - rst = self._dly_sel.storage[i] & self._rdly_dq_bitslip_rst.re, + rst = (self._dly_sel.storage[i] & self._rdly_dq_bitslip_rst.re) | self._rst.storage, slp = self._dly_sel.storage[i] & self._rdly_dq_bitslip.re, cycles = 1) self.submodules += dq_i_bitslip @@ -333,7 +359,7 @@ class ECP5DDRPHY(Module, AutoCSR): o_Z = dq_i_delayed ), Instance("IDDRX2DQA", - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), i_DQSR90 = dqsr90, @@ -350,7 +376,7 @@ class ECP5DDRPHY(Module, AutoCSR): self.comb += dfi.phases[n//4].rddata[n%4*databits+j].eq(dq_i_data[n]) self.specials += [ Instance("TSHX2DQA", - i_RST = ResetSignal("sys"), + i_RST = ResetSignal("sys") | self._rst.storage, i_SCLK = ClockSignal("sys"), i_ECLK = ClockSignal("sys2x"), i_DQSW270 = dqsw270, diff --git a/litedram/phy/s7ddrphy.py b/litedram/phy/s7ddrphy.py index fd5075a..985bf52 100644 --- a/litedram/phy/s7ddrphy.py +++ b/litedram/phy/s7ddrphy.py @@ -29,7 +29,7 @@ class S7DDRPHY(Module, AutoCSR): nphases = 4, sys_clk_freq = 100e6, iodelay_clk_freq = 200e6, - cmd_latency = 0, + cmd_latency = 1, cmd_delay = None): assert not (memtype == "DDR3" and nphases == 2) phytype = self.__class__.__name__ @@ -82,6 +82,9 @@ class S7DDRPHY(Module, AutoCSR): self._wdly_dqs_rst = CSR() self._wdly_dqs_inc = CSR() + self._wdly_dq_bitslip_rst = CSR() + self._wdly_dq_bitslip = CSR() + self._rdphase = CSRStorage(int(math.log2(nphases)), reset=rdphase) self._wrphase = CSRStorage(int(math.log2(nphases)), reset=wrphase) @@ -98,7 +101,7 @@ class S7DDRPHY(Module, AutoCSR): cl = cl, cwl = cwl, read_latency = cl_sys_latency + 6, - write_latency = cwl_sys_latency, + write_latency = cwl_sys_latency - 1, cmd_latency = cmd_latency, cmd_delay = cmd_delay, ) @@ -208,8 +211,8 @@ class S7DDRPHY(Module, AutoCSR): dqs_postamble = Signal() dqs_oe_delay = TappedDelayLine(ntaps=2 if nphases == 4 else 1) dqs_pattern = DQSPattern( - preamble = dqs_preamble, - postamble = dqs_postamble, + #preamble = dqs_preamble, # FIXME + #postamble = dqs_postamble, # FIXME wlevel_en = self._wlevel_en.storage, wlevel_strobe = self._wlevel_strobe.re, register = not with_odelay) @@ -219,6 +222,12 @@ class S7DDRPHY(Module, AutoCSR): dqs_o_no_delay = Signal() dqs_o_delayed = Signal() dqs_t = Signal() + dqs_bitslip = BitSlip(8, + i = dqs_pattern.o, + rst = (self._dly_sel.storage[i] & self._wdly_dq_bitslip_rst.re) | self._rst.storage, + slp = self._dly_sel.storage[i] & self._wdly_dq_bitslip.re, + cycles = 1) + self.submodules += dqs_bitslip self.specials += Instance("OSERDESE2", p_SERDES_MODE = "MASTER", p_DATA_WIDTH = 2*nphases, @@ -228,7 +237,7 @@ class S7DDRPHY(Module, AutoCSR): i_RST = ResetSignal() | self._rst.storage, i_CLK = ClockSignal(ddr_clk) if with_odelay else ClockSignal(ddr_clk+"_dqs"), i_CLKDIV = ClockSignal(), - **{f"i_D{n+1}": dqs_pattern.o[n] for n in range(8)}, + **{f"i_D{n+1}": dqs_bitslip.o[n] for n in range(8)}, i_OCE = 1, o_OFB = dqs_o_no_delay if with_odelay else Signal(), o_OQ = Signal() if with_odelay else dqs_o_no_delay, @@ -264,6 +273,12 @@ class S7DDRPHY(Module, AutoCSR): # DM --------------------------------------------------------------------------------------- for i in range(databits//8): dm_o_nodelay = Signal() + dm_o_bitslip = BitSlip(8, + i = Cat(*[dfi.phases[n//2].wrdata_mask[n%2*databits//8+i] for n in range(8)]), + rst = (self._dly_sel.storage[i] & self._wdly_dq_bitslip_rst.re) | self._rst.storage, + slp = self._dly_sel.storage[i] & self._wdly_dq_bitslip.re, + cycles = 1) + self.submodules += dm_o_bitslip self.specials += Instance("OSERDESE2", p_SERDES_MODE = "MASTER", p_DATA_WIDTH = 2*nphases, @@ -273,7 +288,7 @@ class S7DDRPHY(Module, AutoCSR): i_RST = ResetSignal() | self._rst.storage, i_CLK = ClockSignal(ddr_clk), i_CLKDIV = ClockSignal(), - **{f"i_D{n+1}": dfi.phases[n//2].wrdata_mask[n%2*databits//8+i] for n in range(8)}, + **{f"i_D{n+1}": dm_o_bitslip.o[n] for n in range(8)}, i_OCE = 1, o_OQ = dm_o_nodelay if with_odelay else pads.dm[i], ) @@ -308,6 +323,12 @@ class S7DDRPHY(Module, AutoCSR): dq_i_delayed = Signal() dq_t = Signal() dq_i_data = Signal(8) + dq_o_bitslip = BitSlip(8, + i = Cat(*[dfi.phases[n//2].wrdata[n%2*databits+i] for n in range(8)]), + rst = (self._dly_sel.storage[i//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage, + slp = self._dly_sel.storage[i//8] & self._wdly_dq_bitslip.re, + cycles = 1) + self.submodules += dq_o_bitslip self.specials += Instance("OSERDESE2", p_SERDES_MODE = "MASTER", p_DATA_WIDTH = 2*nphases, @@ -317,7 +338,7 @@ class S7DDRPHY(Module, AutoCSR): i_RST = ResetSignal() | self._rst.storage, i_CLK = ClockSignal(ddr_clk), i_CLKDIV = ClockSignal(), - **{f"i_D{n+1}": dfi.phases[n//2].wrdata[n%2*databits+i] for n in range(8)}, + **{f"i_D{n+1}": dq_o_bitslip.o[n] for n in range(8)}, i_TCE = 1, i_T1 = ~dq_oe_delay.output, o_TQ = dq_t, @@ -325,7 +346,7 @@ class S7DDRPHY(Module, AutoCSR): o_OQ = dq_o_nodelay, ) dq_i_bitslip = BitSlip(8, - rst = self._dly_sel.storage[i//8] & self._rdly_dq_bitslip_rst.re, + rst = (self._dly_sel.storage[i//8] & self._rdly_dq_bitslip_rst.re) | self._rst.storage, slp = self._dly_sel.storage[i//8] & self._rdly_dq_bitslip.re, cycles = 1) self.submodules += dq_i_bitslip @@ -427,17 +448,17 @@ class S7DDRPHY(Module, AutoCSR): # Xilinx Virtex7 (S7DDRPHY with odelay) ------------------------------------------------------------ class V7DDRPHY(S7DDRPHY): - def __init__(self, pads, cmd_latency=1, **kwargs): + def __init__(self, pads, **kwargs): S7DDRPHY.__init__(self, pads, with_odelay=True, **kwargs) # Xilinx Kintex7 (S7DDRPHY with odelay) ------------------------------------------------------------ class K7DDRPHY(S7DDRPHY): - def __init__(self, pads, cmd_latency=1, **kwargs): - S7DDRPHY.__init__(self, pads, cmd_latency=cmd_latency, with_odelay=True, **kwargs) + def __init__(self, pads, **kwargs): + S7DDRPHY.__init__(self, pads, with_odelay=True, **kwargs) # Xilinx Artix7 (S7DDRPHY without odelay, sys2/4x_dqs generated in CRG with 90° phase vs sys2/4x) -- class A7DDRPHY(S7DDRPHY): - def __init__(self, pads, cmd_latency=0, **kwargs): - S7DDRPHY.__init__(self, pads, cmd_latency=0, with_odelay=False, **kwargs) + def __init__(self, pads, **kwargs): + S7DDRPHY.__init__(self, pads, with_odelay=False, cmd_latency=0, **kwargs) diff --git a/litedram/phy/usddrphy.py b/litedram/phy/usddrphy.py index e0ceecd..c401550 100644 --- a/litedram/phy/usddrphy.py +++ b/litedram/phy/usddrphy.py @@ -81,6 +81,9 @@ class USDDRPHY(Module, AutoCSR): self._wdly_dqs_rst = CSR() self._wdly_dqs_inc = CSR() + self._wdly_dq_bitslip_rst = CSR() + self._wdly_dq_bitslip = CSR() + self._rdphase = CSRStorage(2, reset=rdphase) self._wrphase = CSRStorage(2, reset=wrphase) @@ -97,7 +100,7 @@ class USDDRPHY(Module, AutoCSR): cl = cl, cwl = cwl, read_latency = cl_sys_latency + 5, - write_latency = cwl_sys_latency, + write_latency = cwl_sys_latency - 1, cmd_latency = cmd_latency, cmd_delay = cmd_delay, ) @@ -228,8 +231,8 @@ class USDDRPHY(Module, AutoCSR): dqs_postamble = Signal() dqs_oe_delay = TappedDelayLine(ntaps=1) dqs_pattern = DQSPattern( - preamble = dqs_preamble, - postamble = dqs_postamble, + #preamble = dqs_preamble, # FIXME + #postamble = dqs_postamble, # FIXME wlevel_en = self._wlevel_en.storage, wlevel_strobe = self._wlevel_strobe.re) self.submodules += dqs_oe_delay, dqs_pattern @@ -247,6 +250,12 @@ class USDDRPHY(Module, AutoCSR): dqs_taps_done.eq(1), self._half_sys8x_taps.status.eq(dqs_taps) ) + dqs_bitslip = BitSlip(8, + i = dqs_pattern.o, + rst = (self._dly_sel.storage[i] & self._wdly_dq_bitslip_rst.re) | self._rst.storage, + slp = self._dly_sel.storage[i] & self._wdly_dq_bitslip.re, + cycles = 1) + self.submodules += dqs_bitslip if x4_dimm_mode: dqs_pads = ((pads.dqs_p[i*2], pads.dqs_n[i*2]), (pads.dqs_p[i*2 + 1], pads.dqs_n[i*2 + 1])) else: @@ -267,7 +276,7 @@ class USDDRPHY(Module, AutoCSR): i_CLK = ClockSignal("sys4x"), i_CLKDIV = ClockSignal(), i_T = ~dqs_oe_delay.output, - i_D = dqs_pattern.o, + i_D = dqs_bitslip.o, o_OQ = dqs_nodelay, o_T_OUT = dqs_t, @@ -303,6 +312,12 @@ class USDDRPHY(Module, AutoCSR): for i in range(databits//8): if hasattr(pads, "dm"): dm_o_nodelay = Signal() + dm_o_bitslip = BitSlip(8, + i = Cat(*[dfi.phases[n//2].wrdata_mask[n%2*databits//8+i] for n in range(8)]), + rst = (self._dly_sel.storage[i] & self._wdly_dq_bitslip_rst.re) | self._rst.storage, + slp = self._dly_sel.storage[i] & self._wdly_dq_bitslip.re, + cycles = 1) + self.submodules += dm_o_bitslip self.specials += [ Instance("OSERDESE3", p_SIM_DEVICE = device, @@ -314,7 +329,7 @@ class USDDRPHY(Module, AutoCSR): i_RST = ResetSignal() | self._rst.storage, i_CLK = ClockSignal("sys4x"), i_CLKDIV = ClockSignal(), - i_D = Cat(*[dfi.phases[n//2].wrdata_mask[n%2*databits//8+i] for n in range(8)]), + i_D = dm_o_bitslip.o, o_OQ = dm_o_nodelay, ), Instance("ODELAYE3", @@ -348,6 +363,12 @@ class USDDRPHY(Module, AutoCSR): dq_i_nodelay = Signal() dq_i_delayed = Signal() dq_t = Signal() + dq_o_bitslip = BitSlip(8, + i = Cat(*[dfi.phases[n//2].wrdata[n%2*databits+i] for n in range(8)]), + rst = (self._dly_sel.storage[i//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage, + slp = self._dly_sel.storage[i//8] & self._wdly_dq_bitslip.re, + cycles = 1) + self.submodules += dq_o_bitslip self.specials += Instance("OSERDESE3", p_SIM_DEVICE = device, p_DATA_WIDTH = 8, @@ -358,13 +379,13 @@ class USDDRPHY(Module, AutoCSR): i_RST = ResetSignal() | self._rst.storage, i_CLK = ClockSignal("sys4x"), i_CLKDIV = ClockSignal(), - i_D = Cat(*[dfi.phases[n//2].wrdata[n%2*databits+i] for n in range(8)]), + i_D = dq_o_bitslip.o, i_T = ~dq_oe_delay.output, o_OQ = dq_o_nodelay, o_T_OUT = dq_t, ) dq_i_bitslip = BitSlip(8, - rst = self._dly_sel.storage[i//8] & self._rdly_dq_bitslip_rst.re, + rst = (self._dly_sel.storage[i//8] & self._rdly_dq_bitslip_rst.re) | self._rst.storage, slp = self._dly_sel.storage[i//8] & self._rdly_dq_bitslip.re, cycles = 1) self.submodules += dq_i_bitslip @@ -465,5 +486,5 @@ class USDDRPHY(Module, AutoCSR): # Xilinx Ultrascale Plus DDR3/DDR4 PHY ------------------------------------------------------------- class USPDDRPHY(USDDRPHY): - def __init__(self, pads, cmd_latency=1, **kwargs): - USDDRPHY.__init__(self, pads, cmd_latency=cmd_latency, **kwargs) + def __init__(self, pads, **kwargs): + USDDRPHY.__init__(self, pads, **kwargs) diff --git a/test/reference/ddr3_init.h b/test/reference/ddr3_init.h index d7e47c5..f399bed 100644 --- a/test/reference/ddr3_init.h +++ b/test/reference/ddr3_init.h @@ -25,6 +25,7 @@ #define SDRAM_PHY_RDPHASE 0 #define SDRAM_PHY_WRPHASE 1 #define SDRAM_PHY_WRITE_LEVELING_CAPABLE +#define SDRAM_PHY_WRITE_LATENCY_CALIBRATION_CAPABLE #define SDRAM_PHY_READ_LEVELING_CAPABLE #define SDRAM_PHY_MODULES DFII_PIX_DATA_BYTES/2 #define SDRAM_PHY_DELAYS 32 diff --git a/test/reference/ddr4_init.h b/test/reference/ddr4_init.h index 4691ac3..6abf3c9 100644 --- a/test/reference/ddr4_init.h +++ b/test/reference/ddr4_init.h @@ -26,6 +26,7 @@ #define SDRAM_PHY_WRPHASE 2 #define SDRAM_PHY_WRITE_LEVELING_CAPABLE #define SDRAM_PHY_WRITE_LEVELING_REINIT +#define SDRAM_PHY_WRITE_LATENCY_CALIBRATION_CAPABLE #define SDRAM_PHY_READ_LEVELING_CAPABLE #define SDRAM_PHY_MODULES DFII_PIX_DATA_BYTES/2 #define SDRAM_PHY_DELAYS 512