From 4a5feb9e11c84e00cade0e9c29aa85b1c340762f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C4=99drzej=20Boczar?= Date: Tue, 9 Feb 2021 14:37:28 +0100 Subject: [PATCH] lpddr4/s7phy: improve to use the new DoubleRateLPDDR4PHY --- litedram/phy/lpddr4/basephy.py | 14 +- litedram/phy/lpddr4/s7phy.py | 241 ++++++++++++++++++++------------- 2 files changed, 152 insertions(+), 103 deletions(-) diff --git a/litedram/phy/lpddr4/basephy.py b/litedram/phy/lpddr4/basephy.py index 3fd2f57..539eda0 100644 --- a/litedram/phy/lpddr4/basephy.py +++ b/litedram/phy/lpddr4/basephy.py @@ -230,7 +230,7 @@ class LPDDR4PHY(Module, AutoCSR): dw = 2*nphases, cycles = bitslip_cycles, rst = self.get_rst(bit//8, self._wdly_dq_bitslip_rst), - slp = self.get_slp(bit//8, self._wdly_dq_bitslip), + slp = self.get_inc(bit//8, self._wdly_dq_bitslip), i = Cat(*wrdata), o = self.out.dq_o[bit], ) @@ -241,7 +241,7 @@ class LPDDR4PHY(Module, AutoCSR): dw = 2*nphases, cycles = bitslip_cycles, rst = self.get_rst(bit//8, self._rdly_dq_bitslip_rst), - slp = self.get_slp(bit//8, self._rdly_dq_bitslip), + slp = self.get_inc(bit//8, self._rdly_dq_bitslip), i = self.out.dq_i[bit], o = dq_i_bs, ) @@ -268,7 +268,7 @@ class LPDDR4PHY(Module, AutoCSR): dw = 2*nphases, cycles = bitslip_cycles, rst = self.get_rst(byte, self._wdly_dq_bitslip_rst), - slp = self.get_slp(byte, self._wdly_dq_bitslip), + slp = self.get_inc(byte, self._wdly_dq_bitslip), i = dqs_pattern.o, o = self.out.dqs_o[byte], ) @@ -292,7 +292,7 @@ class LPDDR4PHY(Module, AutoCSR): dw = 2*nphases, cycles = bitslip_cycles, rst = self.get_rst(byte, self._wdly_dq_bitslip_rst), - slp = self.get_slp(byte, self._wdly_dq_bitslip), + slp = self.get_inc(byte, self._wdly_dq_bitslip), i = Cat(*wrdata_mask), o = self.out.dmi_o[byte], ) @@ -340,10 +340,12 @@ class LPDDR4PHY(Module, AutoCSR): self.comb += dqs_postamble.eq(wrdata_en_tap(wrtap + 1) & ~wrdata_en_tap(wrtap + 0)) def get_rst(self, byte, rst_csr): + assert isinstance(rst_csr, CSR) and 
rst_csr.name.endswith("rst"), rst_csr return (self._dly_sel.storage[byte] & rst_csr.re) | self._rst.storage - def get_slp(self, byte, slp_csr): - return self._dly_sel.storage[byte] & slp_csr.re + def get_inc(self, byte, inc_csr): + assert isinstance(inc_csr, CSR) and not inc_csr.name.endswith("rst"), inc_csr + return self._dly_sel.storage[byte] & inc_csr.re class DoubleRateLPDDR4PHY(LPDDR4PHY): diff --git a/litedram/phy/lpddr4/s7phy.py b/litedram/phy/lpddr4/s7phy.py index e651c36..a46bb3e 100644 --- a/litedram/phy/lpddr4/s7phy.py +++ b/litedram/phy/lpddr4/s7phy.py @@ -5,125 +5,168 @@ from litex.soc.interconnect.csr import * from litedram.common import * from litedram.phy.dfi import * -from litedram.phy.lpddr4.utils import delayed -from litedram.phy.lpddr4.basephy import LPDDR4PHY +from litedram.phy.lpddr4.utils import delayed as delayed +from litedram.phy.lpddr4.basephy import DoubleRateLPDDR4PHY, Latency -# TODO: add option to avoid ODELAYE2, for now it won't work on Artix7 -class S7LPDDR4PHY(LPDDR4PHY): +class S7LPDDR4PHY(DoubleRateLPDDR4PHY): def __init__(self, pads, *, iodelay_clk_freq, **kwargs): + # TODO: add `with_odelay` argument to avoid ODELAYE2, currently it won't work on Artix7 self.iodelay_clk_freq = iodelay_clk_freq + # DoubleRateLPDDR4PHY outputs half-width signals (compared to LPDDR4PHY) in sys2x domain. + # This allows us to use 8:1 DDR OSERDESE2/ISERDESE2 to (de-)serialize the data. 
+ _sys2x = 4 super().__init__(pads, - # TODO: verify - write_ser_latency = 1, # OSERDESE2 8:1 DDR (4 full-rate clocks) - read_des_latency = 2, # ISERDESE2 NETWORKING - phytype = self.__class__.__name__, + ser_latency = Latency(sys=0, sys8x=1*_sys2x), # OSERDESE2 8:1 DDR (4 full-rate clocks) + des_latency = Latency(sys=2, sys8x=2*_sys2x), # ISERDESE2 NETWORKING + phytype = self.__class__.__name__, **kwargs ) + self.submodules.sys2x_delay = ClockDomainsRenamer("sys2x")(Module()) + # Parameters ------------------------------------------------------------------------------- - iodelay_tap_average = { - 200e6: 78e-12, - 300e6: 52e-12, - 400e6: 39e-12, # Only valid for -3 and -2/2E speed grades - } - half_sys8x_taps = math.floor(self.tck/(4*iodelay_tap_average[iodelay_clk_freq])) + # Calculate value of taps needed to shift a signal by 90 degrees. + # Using iodelay_clk_freq of 300MHz/400MHz is only valid for -3 and -2/2E speed grades. + # FIXME: this should be named sys16x, but using sys8x due to a name hard-coded in BIOS + assert iodelay_clk_freq in [200e6, 300e6, 400e6] + iodelay_tap_average = 1 / (2*32 * iodelay_clk_freq) + half_sys8x_taps = math.floor(self.tck / (4 * iodelay_tap_average)) # Registers -------------------------------------------------------------------------------- self._half_sys8x_taps = CSRStorage(5, reset=half_sys8x_taps) # odelay control - self._cdly_rst = CSR() - self._cdly_inc = CSR() - self._rdly_dq_rst = CSR() - self._rdly_dq_inc = CSR() - self._wdly_dq_rst = CSR() - self._wdly_dq_inc = CSR() - self._wdly_dqs_rst = CSR() - self._wdly_dqs_inc = CSR() + self._cdly_rst = CSR() + self._cdly_inc = CSR() + self._rdly_dq_rst = CSR() + self._rdly_dq_inc = CSR() + self._wdly_dq_rst = CSR() + self._wdly_dq_inc = CSR() + self._wdly_dqs_rst = CSR() + self._wdly_dqs_inc = CSR() cdly_rst = self._cdly_rst.re | self._rst.storage cdly_inc = self._cdly_inc.re # Serialization ---------------------------------------------------------------------------- - # TODO: 
need to implement half-serialization from sys (16 bits) to sys2x (8 bits) before oserdese # Clock clk_ser = Signal() clk_dly = Signal() - self.oserdese2_ddr(din=self.ck_clk, dout=clk_ser, clk="sys8x") + self.oserdese2_ddr(din=self.out.clk, dout=clk_ser, clk="sys8x") self.odelaye2(din=clk_ser, dout=clk_dly, rst=cdly_rst, inc=cdly_inc) self.obufds(din=clk_dly, dout=self.pads.clk_p, dout_b=self.pads.clk_n) - # probably no need for oserdese + # FIXME: probably no need to serialize those for cmd in ["cke", "odt", "reset_n"]: cmd_ser = Signal() - self.oserdese2_ddr(din=getattr(self, f"ck_{cmd}"), dout=cmd_ser, clk="sys8x") + self.oserdese2_sdr(din=getattr(self.out, cmd), dout=cmd_ser, clk="sys8x") self.odelaye2(din=cmd_ser, dout=getattr(self.pads, cmd), rst=cdly_rst, inc=cdly_inc) # Commands cs_ser = Signal() - self.oserdese2_ddr(din=self.ck_cs, dout=cs_ser, clk="sys8x") + self.oserdese2_sdr(din=self.out.cs, dout=cs_ser, clk="sys8x") self.odelaye2(din=cs_ser, dout=self.pads.cs, rst=cdly_rst, inc=cdly_inc) - for i in range(6): + for bit in range(6): ca_ser = Signal() - self.oserdese2_ddr(din=self.ck_ca[i], dout=ca_ser, clk="sys8x") - self.odelaye2(din=ca_ser, dout=self.pads.ca[i], rst=cdly_rst, inc=cdly_inc) + self.oserdese2_sdr(din=self.out.ca[bit], dout=ca_ser, clk="sys8x") + self.odelaye2(din=ca_ser, dout=self.pads.ca[bit], rst=cdly_rst, inc=cdly_inc) # DQS - for i in range(self.databits//8): + for byte in range(self.databits//8): # DQS - dqs_t = Signal() + dqs_t = Signal() dqs_ser = Signal() dqs_dly = Signal() - rst = (self._dly_sel.storage[i] & self._wdly_dqs_rst.re) | self._rst.storage - inc = self._dly_sel.storage[i] & self._wdly_dqs_inc.re self.oserdese2_ddr( - din=self.ck_dqs_o[i], dout=dqs_ser, - tin=~self.dqs_oe, tout=dqs_t, - clk="sys8x") - self.odelaye2(din=dqs_ser, dout=dqs_dly, rst=rst, inc=inc) + din = self.out.dqs_o[byte], + dout_fb = dqs_ser, + tin = ~self.out.dqs_oe, + tout = dqs_t, + clk = "sys8x", # TODO: if odelay is not available, need to use sys8x_90 
+ ) + self.odelaye2( + din = dqs_ser, + dout = dqs_dly, + rst = self.get_rst(byte, self._wdly_dqs_rst), + inc = self.get_inc(byte, self._wdly_dqs_inc), + init = half_sys8x_taps, # shifts by 90 degrees + ) self.iobufds( - din=dqs_dly, dout=Signal(), - dinout=self.pads.dqs_p[i], dinout_b=self.pads.dqs_n[i], - tin=dqs_t) + din = dqs_dly, + dout = Signal(), # TODO: DQS input path + tin = dqs_t, + dinout = self.pads.dqs_p[byte], + dinout_b = self.pads.dqs_n[byte], + ) # DMI - for i in range(self.databits//8): - dmi_t = Signal() + for byte in range(self.databits//8): + dmi_t = Signal() dmi_ser = Signal() dmi_dly = Signal() - rst = (self._dly_sel.storage[i] & self._wdly_dq_rst.re) | self._rst.storage - inc = self._dly_sel.storage[i] & self._wdly_dq_inc.re self.oserdese2_ddr( - din=self.ck_dmi_o[i], dout=dmi_ser, - tin=~self.dmi_oe, tout=dmi_t, - clk="sys8x") - self.odelaye2(din=dmi_ser, dout=dmi_dly, rst=rst, inc=inc) - self.iobuf(din=dmi_dly, dout=Signal(), dinout=self.pads.dmi[i], tin=dmi_t) + din = self.out.dmi_o[byte], + dout_fb = dmi_ser, + tin = ~self.out.dmi_oe, + tout = dmi_t, + clk = "sys8x", + ) + self.odelaye2( + din = dmi_ser, + dout = dmi_dly, + rst = self.get_rst(byte, self._wdly_dq_rst), + inc = self.get_inc(byte, self._wdly_dq_inc), + ) + self.iobuf( + din = dmi_dly, + dout = Signal(), + tin = dmi_t, + dinout = self.pads.dmi[byte], + ) # DQ - for i in range(self.databits): - dq_t = Signal() - dq_ser = Signal() - dq_dly = Signal() - dq_i = Signal() + for bit in range(self.databits): + dq_t = Signal() + dq_ser = Signal() + dq_dly = Signal() + dq_i = Signal() dq_i_dly = Signal() - - rst_w = (self._dly_sel.storage[i//8] & self._wdly_dq_rst.re) | self._rst.storage - inc_w = self._dly_sel.storage[i//8] & self._wdly_dq_inc.re - rst_r = (self._dly_sel.storage[i//8] & self._rdly_dq_rst.re) | self._rst.storage - inc_r = self._dly_sel.storage[i//8] & self._rdly_dq_inc.re - self.oserdese2_ddr( - din=self.ck_dq_o[i], dout=dq_ser, - tin=~self.dq_oe, tout=dq_t, - 
clk="sys8x") - self.odelaye2(din=dq_ser, dout=dq_dly, rst=rst_w, inc=inc_w) - self.iobuf(din=dq_dly, dout=dq_i, dinout=self.pads.dq[i], tin=dq_t) - self.idelaye2(din=dq_i, dout=dq_i_dly, rst=rst_r, inc=inc_r) - self.iserdese2_ddr(din=dq_i_dly, dout=self.ck_dq_i[i], clk="sys8x") + din = self.out.dq_o[bit], + dout_fb = dq_ser, # TODO: compare: S7DDRPHY uses OQ not OFB + tin = ~self.out.dq_oe, + tout = dq_t, + clk = "sys8x", + ) + self.odelaye2( + din = dq_ser, + dout = dq_dly, + rst = self.get_rst(bit//8, self._wdly_dq_rst), + inc = self.get_inc(bit//8, self._wdly_dq_inc), + ) + self.iobuf( + din = dq_dly, + dout = dq_i, + dinout = self.pads.dq[bit], + tin = dq_t + ) + self.idelaye2( + din = dq_i, + dout = dq_i_dly, + rst = self.get_rst(bit//8, self._rdly_dq_rst), + inc = self.get_inc(bit//8, self._rdly_dq_inc) + ) + self.iserdese2_ddr( + din = dq_i_dly, + dout = self.out.dq_i[bit], + clk = "sys8x" + ) + + def delayed_sys2x(self, sig, **kwargs): + return delayed(self.sys2x_delay, sig, **kwargs) def idelaye2(self, *, din, dout, init=0, rst=None, inc=None): assert not ((rst is None) ^ (inc is None)) @@ -154,9 +197,10 @@ class S7LPDDR4PHY(LPDDR4PHY): self.specials += Instance("IDELAYE2", **params) - def odelaye2(self, *, din, dout, init=0, rst=None, inc=None): # Not available for Artix7 + def odelaye2(self, *, din, dout, clk=None, init=0, rst=None, inc=None): # Not available for Artix7 assert not ((rst is None) ^ (inc is None)) fixed = rst is not None + assert clk is not None or fixed params = dict( p_SIGNAL_PATTERN = "DATA", @@ -174,7 +218,7 @@ class S7LPDDR4PHY(LPDDR4PHY): if not fixed: params.update(dict( p_ODELAY_TYPE = "VARIABLE", - i_C = ClockSignal(), + i_C = ClockSignal(clk), i_LD = rst, i_CE = inc, i_LDPIPEEN = 0, @@ -183,66 +227,69 @@ class S7LPDDR4PHY(LPDDR4PHY): self.specials += Instance("ODELAYE2", **params) - def oserdese2_ddr(self, *, din, dout, clk, tin=None, tout=None): - # FIXME: must implement 1 step of serialization manually (16bit -> 8bit) - # 
assert self.nphases == 4 - nphases = 4 - assert not ((tin is None) ^ (tout is None)) + def oserdese2_ddr(self, *, din, clk, dout=None, dout_fb=None, tin=None, tout=None): + data_width = len(din) + assert data_width == 8, (data_width, din) + assert not ((tin is None) ^ (tout is None)), "When using tristate specify both `tin` and `tout`" + assert not ((dout is None) and (dout_fb is None)), "Output to OQ (-> IOB) and/or to OFB (-> ISERDESE2/ODELAYE2)" + + dout = Signal() if dout is None else dout + dout_fb = Signal() if dout_fb is None else dout_fb params = dict( p_SERDES_MODE = "MASTER", - p_DATA_WIDTH = 2*nphases, + p_DATA_WIDTH = data_width, p_TRISTATE_WIDTH = 1, p_DATA_RATE_OQ = "DDR", p_DATA_RATE_TQ = "BUF", - i_RST = ResetSignal(), + i_RST = ResetSignal() | self._rst.storage, i_CLK = ClockSignal(clk), - i_CLKDIV = ClockSignal("sys"), + i_CLKDIV = ClockSignal("sys2x"), o_OQ = dout, + o_OFB = dout_fb, i_OCE = 1, ) - for i in range(2*nphases): - params["i_D{}".format(i+1)] = din[i] + for i in range(data_width): + params[f"i_D{i+1}"] = din[i] if tin is not None: # with DATA_RATE_TQ=BUF tristate is asynchronous, so we need to delay it - tin_d = Signal() - self.sync += tin_d.eq(tin) - - # register it on the CLKDIV (as it would be too short for 180 deg shifted clk) - tin_cdc = Signal() - sd_clkdiv = getattr(self.sync, clk) - sd_clkdiv += tin_cdc.eq(tin_d) - - params.update(dict(i_TCE=1, i_T1=tin_cdc, o_TQ=tout)) + params.update(dict(i_TCE=1, i_T1=self.delayed_sys2x(tin), o_TQ=tout)) self.specials += Instance("OSERDESE2", **params) + def oserdese2_sdr(self, **kwargs): + # Use 8:1 OSERDESE2 DDR instead of 4:1 OSERDESE2 SDR to have the same latency + din = kwargs["din"] + din_ddr = Signal(2*len(din)) + kwargs["din"] = din_ddr + self.comb += din_ddr.eq(Cat(*[Replicate(bit, 2) for bit in din])) + self.oserdese2_ddr(**kwargs) + def iserdese2_ddr(self, *, din, dout, clk): - # FIXME: must implement 1 step of serialization manually (16bit -> 8bit) - # assert self.nphases == 4 - 
nphases = 4 + data_width = len(dout) + assert data_width == 8, (data_width, dout) params = dict( p_SERDES_MODE = "MASTER", p_INTERFACE_TYPE = "NETWORKING", # TODO: try using MEMORY mode? - p_DATA_WIDTH = 2*nphases, + p_DATA_WIDTH = data_width, p_DATA_RATE = "DDR", p_NUM_CE = 1, p_IOBDELAY = "IFD", - i_RST = ResetSignal(), + i_RST = ResetSignal() | self._rst.storage, i_CLK = ClockSignal(clk), i_CLKB = ~ClockSignal(clk), - i_CLKDIV = ClockSignal("sys"), + i_CLKDIV = ClockSignal("sys2x"), i_BITSLIP = 0, i_CE1 = 1, i_DDLY = din, ) - for i in range(2*nphases): + for i in range(data_width): # invert order - params["o_Q{}".format(i+1)] = dout[(2*nphases - 1) - i] + params[f"o_Q{i+1}"] = dout[(data_width - 1) - i] self.specials += Instance("ISERDESE2", **params)