diff --git a/litedram/phy/ecp5ddrphy.py b/litedram/phy/ecp5ddrphy.py index 4454c4a..ad7eea6 100644 --- a/litedram/phy/ecp5ddrphy.py +++ b/litedram/phy/ecp5ddrphy.py @@ -130,7 +130,7 @@ class ECP5DDRPHY(Module, AutoCSR): wrcmdphase = wrcmdphase, cl = cl, cwl = cwl, - read_latency = 2 + cl_sys_latency + 2 + log2_int(4//nphases) + 4, + read_latency = 2 + cl_sys_latency + 2 + log2_int(4//nphases) + 5, write_latency = cwl_sys_latency ) @@ -140,7 +140,6 @@ class ECP5DDRPHY(Module, AutoCSR): # # # bl8_chunk = Signal() - rddata_en = Signal(self.settings.read_latency) # Iterate on pads groups ------------------------------------------------------------------- for pads_group in range(len(pads.groups)): @@ -202,10 +201,11 @@ class ECP5DDRPHY(Module, AutoCSR): ) # DQ --------------------------------------------------------------------------------------- - dq_oe = Signal() - dqs_oe = Signal() - dqs_pattern = DQSPattern() - self.submodules += dqs_pattern + dq_oe = Signal() + dqs_re = Signal() + dqs_oe = Signal() + dqs_postamble = Signal() + dqs_preamble = Signal() for i in range(databits//8): # DQSBUFM dqs_i = Signal() @@ -225,21 +225,6 @@ class ECP5DDRPHY(Module, AutoCSR): ) datavalid = Signal() burstdet = Signal() - dqs_read = Signal() - dqs_bitslip = Signal(2) - self.sync += [ - If(self._dly_sel.storage[i], - If(self._rdly_dq_bitslip_rst.re, - dqs_bitslip.eq(0) - ).Elif(self._rdly_dq_bitslip.re, - dqs_bitslip.eq(dqs_bitslip + 1) - ) - ) - ] - dqs_cases = {} - for j, b in enumerate(range(-2, 2)): - dqs_cases[j] = dqs_read.eq(rddata_en[cl_sys_latency + b:cl_sys_latency + b + 2] != 0) - self.sync += Case(dqs_bitslip, dqs_cases) self.specials += Instance("DQSBUFM", p_DQS_LI_DEL_ADJ = "MINUS", p_DQS_LI_DEL_VAL = 1, @@ -262,8 +247,8 @@ class ECP5DDRPHY(Module, AutoCSR): i_WRDIRECTION = 1, # Reads (generate shifted DQS clock for reads) - i_READ0 = dqs_read, - i_READ1 = dqs_read, + i_READ0 = dqs_re, + i_READ1 = dqs_re, i_READCLKSEL0 = rdly[0], i_READCLKSEL1 = rdly[1], i_READCLKSEL2 = rdly[2], @@ -308,7 +293,7 @@ class ECP5DDRPHY(Module, AutoCSR): dm_bl8_cases = {} dm_bl8_cases[0] = dm_o_data_muxed.eq(dm_o_data[:4]) dm_bl8_cases[1] = dm_o_data_muxed.eq(dm_o_data_d[4:]) - self.sync += Case(bl8_chunk, dm_bl8_cases) # FIXME: use self.comb? + self.sync += Case(bl8_chunk, dm_bl8_cases) self.specials += Instance("ODDRX2DQA", i_RST = ResetSignal("sys2x"), i_ECLK = ClockSignal("sys2x"), @@ -329,10 +314,10 @@ class ECP5DDRPHY(Module, AutoCSR): i_ECLK = ClockSignal("sys2x"), i_SCLK = ClockSignal(), i_DQSW = dqsw, - i_D0 = 0, # FIXME: dqs_pattern.o[3], - i_D1 = 1, # FIXME: dqs_pattern.o[2], - i_D2 = 0, # FIXME: dqs_pattern.o[1], - i_D3 = 1, # FIXME: dqs_pattern.o[0], + i_D0 = 0, + i_D1 = 1, + i_D2 = 0, + i_D3 = 1, o_Q = dqs ), Instance("TSHX2DQSA", @@ -340,8 +325,8 @@ class ECP5DDRPHY(Module, AutoCSR): i_ECLK = ClockSignal("sys2x"), i_SCLK = ClockSignal(), i_DQSW = dqsw, - i_T0 = ~(dqs_pattern.preamble | dqs_oe | dqs_pattern.postamble), - i_T1 = ~(dqs_pattern.preamble | dqs_oe | dqs_pattern.postamble), + i_T0 = ~(dqs_oe | dqs_postamble), + i_T1 = ~(dqs_oe | dqs_preamble), o_Q = dqs_oe_n ), Tristate(pads.dqs_p[i], dqs, ~dqs_oe_n, dqs_i) @@ -352,7 +337,7 @@ class ECP5DDRPHY(Module, AutoCSR): dq_i = Signal() dq_oe_n = Signal() dq_i_delayed = Signal() - dq_i_data = Signal(8) + dq_i_data = Signal(4) dq_o_data = Signal(8) dq_o_data_d = Signal(8) dq_o_data_muxed = Signal(4) @@ -371,8 +356,7 @@ class ECP5DDRPHY(Module, AutoCSR): dq_bl8_cases = {} dq_bl8_cases[0] = dq_o_data_muxed.eq(dq_o_data[:4]) dq_bl8_cases[1] = dq_o_data_muxed.eq(dq_o_data_d[4:]) - self.sync += Case(bl8_chunk, dq_bl8_cases) # FIXME: use self.comb? - _dq_i_data = Signal(4) + self.sync += Case(bl8_chunk, dq_bl8_cases) self.specials += [ Instance("ODDRX2DQA", i_RST = ResetSignal("sys2x"), @@ -405,23 +389,29 @@ class ECP5DDRPHY(Module, AutoCSR): i_WRPNTR1 = wrpntr[1], i_WRPNTR2 = wrpntr[2], i_D = dq_i_delayed, - o_Q0 = _dq_i_data[0], - o_Q1 = _dq_i_data[1], - o_Q2 = _dq_i_data[2], - o_Q3 = _dq_i_data[3], + o_Q0 = dq_i_data[0], + o_Q1 = dq_i_data[1], + o_Q2 = dq_i_data[2], + o_Q3 = dq_i_data[3], ) ] - self.sync += dq_i_data[:4].eq(dq_i_data[4:]) - self.sync += dq_i_data[4:].eq(_dq_i_data) + dq_i_bitslip = BitSlip(4, + rst = self._dly_sel.storage[i] & self._rdly_dq_bitslip_rst.re, + slp = self._dly_sel.storage[i] & self._rdly_dq_bitslip.re, + cycles = 1) + self.submodules += dq_i_bitslip + dq_i_bitslip_o_d = Signal(4) + self.comb += dq_i_bitslip.i.eq(dq_i_data) + self.sync += dq_i_bitslip_o_d.eq(dq_i_bitslip.o) self.comb += [ - dfi.phases[0].rddata[0*databits+j].eq(dq_i_data[0]), - dfi.phases[0].rddata[1*databits+j].eq(dq_i_data[1]), - dfi.phases[0].rddata[2*databits+j].eq(dq_i_data[2]), - dfi.phases[0].rddata[3*databits+j].eq(dq_i_data[3]), - dfi.phases[1].rddata[0*databits+j].eq(dq_i_data[4]), - dfi.phases[1].rddata[1*databits+j].eq(dq_i_data[5]), - dfi.phases[1].rddata[2*databits+j].eq(dq_i_data[6]), - dfi.phases[1].rddata[3*databits+j].eq(dq_i_data[7]), + dfi.phases[0].rddata[0*databits+j].eq(dq_i_bitslip_o_d[0]), + dfi.phases[0].rddata[1*databits+j].eq(dq_i_bitslip_o_d[1]), + dfi.phases[0].rddata[2*databits+j].eq(dq_i_bitslip_o_d[2]), + dfi.phases[0].rddata[3*databits+j].eq(dq_i_bitslip_o_d[3]), + dfi.phases[1].rddata[0*databits+j].eq(dq_i_bitslip.o[0]), + dfi.phases[1].rddata[1*databits+j].eq(dq_i_bitslip.o[1]), + dfi.phases[1].rddata[2*databits+j].eq(dq_i_bitslip.o[2]), + dfi.phases[1].rddata[3*databits+j].eq(dq_i_bitslip.o[3]), ] self.specials += [ Instance("TSHX2DQA", @@ -429,8 +419,8 @@ class ECP5DDRPHY(Module, AutoCSR): i_ECLK = ClockSignal("sys2x"), i_SCLK = ClockSignal(), i_DQSW270 = dqsw270, - i_T0 = ~(dqs_pattern.preamble | dq_oe | dqs_pattern.postamble), - i_T1 = ~(dqs_pattern.preamble | dq_oe | dqs_pattern.postamble), + i_T0 = ~dq_oe, + i_T1 = ~dq_oe, o_Q = dq_oe_n, ), Tristate(pads.dq[j], dq_o, ~dq_oe_n, dq_i) @@ -446,10 +436,13 @@ class ECP5DDRPHY(Module, AutoCSR): # # The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI # interface, the latency is the sum of the ODDRX2DQA, CAS, IDDRX2DQA latencies. + rddata_en = Signal(self.settings.read_latency) rddata_en_last = Signal.like(rddata_en) self.comb += rddata_en.eq(Cat(dfi.phases[self.settings.rdphase].rddata_en, rddata_en_last)) self.sync += rddata_en_last.eq(rddata_en) self.sync += [phase.rddata_valid.eq(rddata_en[-1]) for phase in dfi.phases] + self.comb += dqs_re.eq(rddata_en[cl_sys_latency + 1] | rddata_en[cl_sys_latency + 2]) + # Write Control Path ----------------------------------------------------------------------- # Creates a shift register of write commands coming from the DFI interface. This shift register @@ -457,12 +450,11 @@ class ECP5DDRPHY(Module, AutoCSR): # interface: The PHY is operating in halfrate mode (so provide 4 datas every sys_clk cycles: # 2x for DDR, 2x for halfrate) but DDR3 requires a burst of 8 datas (BL8) for best efficiency. # Writes are then performed in 2 sys_clk cycles and data needs to be selected for each cycle. - # FIXME: understand +2 - wrdata_en = Signal(cwl_sys_latency + 5) + wrdata_en = Signal(cwl_sys_latency + 4) wrdata_en_last = Signal.like(wrdata_en) self.comb += wrdata_en.eq(Cat(dfi.phases[self.settings.wrphase].wrdata_en, wrdata_en_last)) self.sync += wrdata_en_last.eq(wrdata_en) - self.comb += dq_oe.eq(wrdata_en[cwl_sys_latency + 2] | wrdata_en[cwl_sys_latency + 3]) + self.comb += dq_oe.eq(wrdata_en[cwl_sys_latency + 1] | wrdata_en[cwl_sys_latency + 2]) self.comb += bl8_chunk.eq(wrdata_en[cwl_sys_latency + 1]) self.comb += dqs_oe.eq(dq_oe) @@ -470,5 +462,5 @@ class ECP5DDRPHY(Module, AutoCSR): # Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last # write. During writes, DQS tristate is configured as output for at least 4 sys_clk cycles: # 1 for Preamble, 2 for the Write and 1 for the Postamble. - self.comb += dqs_pattern.preamble.eq( wrdata_en[cwl_sys_latency + 1] & ~wrdata_en[cwl_sys_latency + 2]) - self.comb += dqs_pattern.postamble.eq(wrdata_en[cwl_sys_latency + 4] & ~wrdata_en[cwl_sys_latency + 3]) + self.comb += dqs_preamble.eq( wrdata_en[cwl_sys_latency + 0] & ~wrdata_en[cwl_sys_latency + 1]) + self.comb += dqs_postamble.eq(wrdata_en[cwl_sys_latency + 3] & ~wrdata_en[cwl_sys_latency + 2])