lpddr4: split implementation into multiple files in separate directory

This commit is contained in:
Jędrzej Boczar 2021-01-19 14:48:17 +01:00
parent ee9c2b4cf7
commit 05ed238829
9 changed files with 751 additions and 714 deletions

View File

@ -554,9 +554,9 @@ def get_lpddr4_phy_init_sequence(phy_settings, timing_settings):
ba = 0
return ("Load More Register {}".format(ma), a, ba, cmds["MODE_REGISTER"], 200)
from litedram.phy.lpddr4phy import DFIPhaseAdapter
zqc_start = DFIPhaseAdapter.MPC["ZQC-START"]
zqc_latch = DFIPhaseAdapter.MPC["ZQC-LATCH"]
from litedram.phy.lpddr4.commands import MPC
zqc_start = MPC["ZQC-START"]
zqc_latch = MPC["ZQC-LATCH"]
init_sequence = [
("Release reset", 0x0000, 0, cmds["UNRESET"], 50000),

View File

@ -0,0 +1,2 @@
from litedram.phy.lpddr4.s7phy import S7LPDDR4PHY
from litedram.phy.lpddr4.simphy import LPDDR4SimPHY

View File

@ -0,0 +1,312 @@
from operator import or_
from functools import reduce
from collections import defaultdict
from migen import *
from litex.soc.interconnect.csr import *
from litedram.common import *
from litedram.phy.dfi import *
from litedram.phy.lpddr4.utils import bitpattern, delayed, ConstBitSlip, DQSPattern
from litedram.phy.lpddr4.commands import DFIPhaseAdapter
class LPDDR4PHY(Module, AutoCSR):
def __init__(self, pads, *,
sys_clk_freq, write_ser_latency, read_des_latency, phytype, cmd_delay=None):
self.pads = pads
self.memtype = memtype = "LPDDR4"
self.nranks = nranks = 1 if not hasattr(pads, "cs_n") else len(pads.cs_n)
self.databits = databits = len(pads.dq)
self.addressbits = addressbits = 17 # for activate row address
self.bankbits = bankbits = 3
self.nphases = nphases = 8
self.tck = tck = 1 / (nphases*sys_clk_freq)
assert databits % 8 == 0
# Parameters -------------------------------------------------------------------------------
def get_cl_cw(memtype, tck):
# MT53E256M16D1, No DBI, Set A
f_to_cl_cwl = OrderedDict()
f_to_cl_cwl[ 532e6] = ( 6, 4) # FIXME: with that low cwl, wrtap is 0
f_to_cl_cwl[1066e6] = (10, 6)
f_to_cl_cwl[1600e6] = (14, 8)
f_to_cl_cwl[2132e6] = (20, 10)
f_to_cl_cwl[2666e6] = (24, 12)
f_to_cl_cwl[3200e6] = (28, 14)
f_to_cl_cwl[3732e6] = (32, 16)
f_to_cl_cwl[4266e6] = (36, 18)
for f, (cl, cwl) in f_to_cl_cwl.items():
if tck >= 2/f:
return cl, cwl
raise ValueError
# Bitslip introduces latency between from `cycles` up to `cycles + 1`
bitslip_cycles = 1
# Commands are sent over 4 cycles of DRAM clock (sys8x)
cmd_latency = 4
# Commands read from adapters are delayed on ConstBitSlips
ca_latency = 1
cl, cwl = get_cl_cw(memtype, tck)
cl_sys_latency = get_sys_latency(nphases, cl)
cwl_sys_latency = get_sys_latency(nphases, cwl)
rdphase = get_sys_phase(nphases, cl_sys_latency, cl + cmd_latency)
wrphase = get_sys_phase(nphases, cwl_sys_latency, cwl + cmd_latency)
# When the calculated phase is negative, it means that we need to increase sys latency
def updated_latency(phase):
delay_update = 0
while phase < 0:
phase += nphases
delay_update += 1
return phase, delay_update
wrphase, cwl_sys_delay = updated_latency(wrphase)
rdphase, cl_sys_delay = updated_latency(rdphase)
cwl_sys_latency += cwl_sys_delay
cl_sys_latency += cl_sys_delay
# Read latency
read_data_delay = ca_latency + write_ser_latency + cl_sys_latency # DFI cmd -> read data on DQ
read_des_delay = read_des_latency + bitslip_cycles # data on DQ -> data on DFI rddata
read_latency = read_data_delay + read_des_delay
# Write latency
write_latency = cwl_sys_latency
# FIXME: remove
if __import__("os").environ.get("DEBUG") == '1':
print('cl', end=' = '); __import__('pprint').pprint(cl)
print('cwl', end=' = '); __import__('pprint').pprint(cwl)
print('cl_sys_latency', end=' = '); __import__('pprint').pprint(cl_sys_latency)
print('cwl_sys_latency', end=' = '); __import__('pprint').pprint(cwl_sys_latency)
print('rdphase', end=' = '); __import__('pprint').pprint(rdphase)
print('wrphase', end=' = '); __import__('pprint').pprint(wrphase)
print('read_data_delay', end=' = '); __import__('pprint').pprint(read_data_delay)
print('read_des_delay', end=' = '); __import__('pprint').pprint(read_des_delay)
print('read_latency', end=' = '); __import__('pprint').pprint(read_latency)
print('write_latency', end=' = '); __import__('pprint').pprint(write_latency)
# Registers --------------------------------------------------------------------------------
self._rst = CSRStorage()
self._dly_sel = CSRStorage(databits//8)
self._wlevel_en = CSRStorage()
self._wlevel_strobe = CSR()
self._dly_sel = CSRStorage(databits//8)
self._rdly_dq_bitslip_rst = CSR()
self._rdly_dq_bitslip = CSR()
self._wdly_dq_bitslip_rst = CSR()
self._wdly_dq_bitslip = CSR()
self._rdphase = CSRStorage(log2_int(nphases), reset=rdphase)
self._wrphase = CSRStorage(log2_int(nphases), reset=wrphase)
# PHY settings -----------------------------------------------------------------------------
self.settings = PhySettings(
phytype = phytype,
memtype = memtype,
databits = databits,
dfi_databits = 2*databits,
nranks = nranks,
nphases = nphases,
rdphase = self._rdphase.storage,
wrphase = self._wrphase.storage,
cl = cl,
cwl = cwl,
read_latency = read_latency,
write_latency = write_latency,
cmd_latency = cmd_latency,
cmd_delay = cmd_delay,
)
# DFI Interface ----------------------------------------------------------------------------
# Due to the fact that LPDDR4 has 16n prefetch we use 8 phases to be able to read/write a
# whole burst during a single controller clock cycle. PHY should use sys8x clock.
self.dfi = dfi = Interface(addressbits, bankbits, nranks, 2*databits, nphases=8)
# # #
adapters = [DFIPhaseAdapter(phase) for phase in self.dfi.phases]
self.submodules += adapters
# Now prepare the data by converting the sequences on adapters into sequences on the pads.
# We have to ignore overlapping commands, and module timings have to ensure that there are
# no overlapping commands anyway.
# Pads: reset_n, CS, CKE, CK, CA[5:0], DMI[1:0], DQ[15:0], DQS[1:0], ODT_CA
self.ck_clk = Signal(2*nphases)
self.ck_cke = Signal(nphases)
self.ck_odt = Signal(nphases)
self.ck_reset_n = Signal(nphases)
self.ck_cs = Signal(nphases)
self.ck_ca = [Signal(nphases) for _ in range(6)]
self.ck_dmi_o = [Signal(2*nphases) for _ in range(2)]
self.ck_dmi_i = [Signal(2*nphases) for _ in range(2)]
self.dmi_oe = Signal()
self.ck_dq_o = [Signal(2*nphases) for _ in range(databits)]
self.ck_dq_i = [Signal(2*nphases) for _ in range(databits)]
self.dq_oe = Signal()
self.ck_dqs_o = [Signal(2*nphases) for _ in range(2)]
self.ck_dqs_i = [Signal(2*nphases) for _ in range(2)]
self.dqs_oe = Signal()
# Clocks -----------------------------------------------------------------------------------
self.comb += self.ck_clk.eq(bitpattern("-_-_-_-_" * 2))
# Simple commands --------------------------------------------------------------------------
self.comb += [
self.ck_cke.eq(Cat(delayed(self, phase.cke) for phase in self.dfi.phases)),
self.ck_odt.eq(Cat(delayed(self, phase.odt) for phase in self.dfi.phases)),
self.ck_reset_n.eq(Cat(delayed(self, phase.reset_n) for phase in self.dfi.phases)),
]
# LPDDR4 Commands --------------------------------------------------------------------------
# Each command can span several phases (up to 4), so we must ignore overlapping commands,
# but in general, module timings should be set in a way that overlapping will never happen.
# Create a history of valid adapters used for masking overlapping ones.
# TODO: make optional, as it takes up resources and the controller should ensure no overlaps
valids = ConstBitSlip(dw=nphases, cycles=1, slp=0)
self.submodules += valids
self.comb += valids.i.eq(Cat(a.valid for a in adapters))
# valids_hist = valids.r
valids_hist = Signal.like(valids.r)
# TODO: especially make this part optional
for i in range(len(valids_hist)):
was_valid_before = reduce(or_, valids_hist[max(0, i-3):i], 0)
self.comb += valids_hist[i].eq(valids.r[i] & ~was_valid_before)
cs_per_adapter = []
ca_per_adapter = defaultdict(list)
for phase, adapter in enumerate(adapters):
# The signals from an adapter can be used if there were no commands on 3 previous cycles
allowed = ~reduce(or_, valids_hist[nphases+phase - 3:nphases+phase])
# Use CS and CA of given adapter slipped by `phase` bits
cs_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
self.submodules += cs_bs
self.comb += cs_bs.i.eq(Cat(adapter.cs)),
cs_mask = Replicate(allowed, len(cs_bs.o))
cs = cs_bs.o & cs_mask
cs_per_adapter.append(cs)
# For CA we need to do the same for each bit
ca_bits = []
for bit in range(6):
ca_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
self.submodules += ca_bs
ca_bit_hist = [adapter.ca[i][bit] for i in range(4)]
self.comb += ca_bs.i.eq(Cat(*ca_bit_hist)),
ca_mask = Replicate(allowed, len(ca_bs.o))
ca = ca_bs.o & ca_mask
ca_per_adapter[bit].append(ca)
# OR all the masked signals
self.comb += self.ck_cs.eq(reduce(or_, cs_per_adapter))
for bit in range(6):
self.comb += self.ck_ca[bit].eq(reduce(or_, ca_per_adapter[bit]))
# DQ ---------------------------------------------------------------------------------------
dq_oe = Signal()
self.comb += self.dq_oe.eq(delayed(self, dq_oe, cycles=1))
for bit in range(self.databits):
# output
self.submodules += BitSlip(
dw = 2*nphases,
cycles = bitslip_cycles,
rst = (self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage,
slp = self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip.re,
i = Cat(*[self.dfi.phases[i//2].wrdata[i%2 * self.databits + bit] for i in range(2*nphases)]),
o = self.ck_dq_o[bit],
)
# input
dq_i_bs = Signal(2*nphases)
self.submodules += BitSlip(
dw = 2*nphases,
cycles = bitslip_cycles,
rst = (self._dly_sel.storage[bit//8] & self._rdly_dq_bitslip_rst.re) | self._rst.storage,
slp = self._dly_sel.storage[bit//8] & self._rdly_dq_bitslip.re,
i = self.ck_dq_i[bit],
o = dq_i_bs,
)
for i in range(2*nphases):
self.comb += self.dfi.phases[i//2].rddata[i%2 * self.databits + bit].eq(dq_i_bs[i])
# DQS --------------------------------------------------------------------------------------
dqs_oe = Signal()
dqs_preamble = Signal()
dqs_postamble = Signal()
dqs_pattern = DQSPattern(
preamble = dqs_preamble, # FIXME: are defined the opposite way (common.py) ???
postamble = dqs_postamble,
wlevel_en = self._wlevel_en.storage,
wlevel_strobe = self._wlevel_strobe.re)
self.submodules += dqs_pattern
self.comb += [
self.dqs_oe.eq(delayed(self, dqs_oe, cycles=1)),
]
for bit in range(self.databits//8):
# output
self.submodules += BitSlip(
dw = 2*nphases,
cycles = bitslip_cycles,
rst = (self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage,
slp = self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip.re,
i = dqs_pattern.o,
o = self.ck_dqs_o[bit],
)
# DMI --------------------------------------------------------------------------------------
# DMI signal is used for Data Mask or Data Bus Invertion depending on Mode Registers values.
# With DM and DBI disabled, this signal is a Don't Care.
# With DM enabled, masking is performed only when the command used is WRITE-MASKED.
# TODO: use WRITE-MASKED for all write commands, and configure Mode Registers for that
# during DRAM initialization (we don't want to support DBI).
for bin in range(self.databits//8):
self.comb += self.ck_dmi_o[bit].eq(0)
# Read Control Path ------------------------------------------------------------------------
# Creates a delay line of read commands coming from the DFI interface. The output is used to
# signal a valid read data to the DFI interface.
#
# The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI
# interface, the latency is the sum of the OSERDESE2, CAS, ISERDESE2 and Bitslip latencies.
rddata_en = TappedDelayLine(
signal = reduce(or_, [dfi.phases[i].rddata_en for i in range(nphases)]),
ntaps = self.settings.read_latency
)
self.submodules += rddata_en
self.comb += [phase.rddata_valid.eq(rddata_en.output | self._wlevel_en.storage) for phase in dfi.phases]
# Write Control Path -----------------------------------------------------------------------
wrtap = cwl_sys_latency - 1
assert wrtap >= 1
# Create a delay line of write commands coming from the DFI interface. This taps are used to
# control DQ/DQS tristates.
wrdata_en = TappedDelayLine(
signal = reduce(or_, [dfi.phases[i].wrdata_en for i in range(nphases)]),
ntaps = wrtap + 2
)
self.submodules += wrdata_en
self.comb += dq_oe.eq(wrdata_en.taps[wrtap])
self.comb += If(self._wlevel_en.storage, dqs_oe.eq(1)).Else(dqs_oe.eq(dqs_preamble | dq_oe | dqs_postamble))
# Write DQS Postamble/Preamble Control Path ------------------------------------------------
# Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
# write. During writes, DQS tristate is configured as output for at least 3 sys_clk cycles:
# 1 for Preamble, 1 for the Write and 1 for the Postamble.
self.comb += dqs_preamble.eq( wrdata_en.taps[wrtap - 1] & ~wrdata_en.taps[wrtap + 0])
self.comb += dqs_postamble.eq(wrdata_en.taps[wrtap + 1] & ~wrdata_en.taps[wrtap + 0])

View File

@ -0,0 +1,151 @@
import re
from migen import *
# MPC (multipurpose command) can be used to perform different actions
# We use ZQC with BA=0 to issue MPC, where OP[6:0] = A[6:0]
MPC = {
"NOP": 0b0000000, # only OP[6] must be 0
"READ-FIFO": 0b1000001,
"READ-DQ-CAL": 0b1000011,
# RFU: 0b1000101
"WRITE-FIFO": 0b1000111,
# RFU: 0b1001001
"START-DQS-OSC": 0b1001011,
"STOP-DQS-OSC": 0b1001101,
"ZQC-START": 0b1001111,
"ZQC-LATCH": 0b1010001,
}
class DFIPhaseAdapter(Module):
"""Translates DFI phase into LPDDR4 command (2- or 4-cycle)
LPDDR4 "full command" consists of 1 or 2 "small commands". Each "small command"
is transmitted over 2 DRAM clock cycles (SDR). This module translates DFI commands
on a single DFI phase into sequencs on CS/CA[5:0] buses (4 cycles). Some DFI commands
consist only of a single "small command". To make counting DRAM timings easier, such
a "small command" shall be sent on the 2nd slot (i.e. 3rd and 4th cycle). All timings
are then counted starting from CS low on the 4th cycle.
"""
def __init__(self, dfi_phase):
# CS/CA values for 4 SDR cycles
self.cs = Signal(4)
self.ca = Array([Signal(6) for _ in range(4)])
self.valid = Signal()
# # #
self.submodules.cmd1 = Command(dfi_phase)
self.submodules.cmd2 = Command(dfi_phase)
self.comb += [
self.cs[:2].eq(self.cmd1.cs),
self.cs[2:].eq(self.cmd2.cs),
self.ca[0].eq(self.cmd1.ca[0]),
self.ca[1].eq(self.cmd1.ca[1]),
self.ca[2].eq(self.cmd2.ca[0]),
self.ca[3].eq(self.cmd2.ca[1]),
]
dfi_cmd = Signal(3)
self.comb += dfi_cmd.eq(Cat(~dfi_phase.we_n, ~dfi_phase.ras_n, ~dfi_phase.cas_n)),
_cmd = { # cas, ras, we
"NOP": 0b000,
"ACT": 0b010,
"RD": 0b100,
"WR": 0b101,
"PRE": 0b011,
"REF": 0b110,
"ZQC": 0b001,
"MRS": 0b111,
}
def cmds(cmd1, cmd2, valid=1):
return self.cmd1.set(cmd1) + self.cmd2.set(cmd2) + [self.valid.eq(valid)]
self.comb += If(dfi_phase.cs_n == 0, # require dfi.cs_n
Case(dfi_cmd, {
_cmd["ACT"]: cmds("ACTIVATE-1", "ACTIVATE-2"),
_cmd["RD"]: cmds("READ-1", "CAS-2"),
_cmd["WR"]: cmds("WRITE-1", "CAS-2"), # TODO: masked write
_cmd["PRE"]: cmds("DESELECT", "PRECHARGE"),
_cmd["REF"]: cmds("DESELECT", "REFRESH"),
_cmd["ZQC"]: cmds("DESELECT", "MPC"),
_cmd["MRS"]: cmds("MRW-1", "MRW-2"),
"default": cmds("DESELECT", "DESELECT", valid=0),
})
)
class Command(Module):
"""LPDDR4 command decoder
Decodes a command from single DFI phase into LPDDR4 "small command"
consisting of 2 CS values and 2 CA[5:0] values.
LPDDR4 "small commands" are transmited over 2 clock cycles. In first
cycle CS is driven high and in the second cycle it stays low. In each
of the cycles the bits on CA[5:0] are latched and interpreted differently.
This module translates a DFI command into the values of CS/CA that shall
be transmitted over 2 DRAM clock cycles.
"""
# String description of 1st and 2nd edge of each command, later parsed to
# construct the value. CS is assumed to be H for 1st edge and L for 2nd edge.
TRUTH_TABLE = {
"MRW-1": ["L H H L L OP7", "MA0 MA1 MA2 MA3 MA4 MA5"],
"MRW-2": ["L H H L H OP6", "OP0 OP1 OP2 OP3 OP4 OP5"],
"MRR-1": ["L H H H L V", "MA0 MA1 MA2 MA3 MA4 MA5"],
"REFRESH": ["L L L H L AB", "BA0 BA1 BA2 V V V"],
"ACTIVATE-1": ["H L R12 R13 R14 R15", "BA0 BA1 BA2 R16 R10 R11"],
"ACTIVATE-2": ["H H R6 R7 R8 R9", "R0 R1 R2 R3 R4 R5"],
"WRITE-1": ["L L H L L BL", "BA0 BA1 BA2 V C9 AP"],
"MASK WRITE-1": ["L L H H L BL", "BA0 BA1 BA2 V C9 AP"],
"READ-1": ["L H L L L BL", "BA0 BA1 BA2 V C9 AP"],
"CAS-2": ["L H L L H C8", "C2 C3 C4 C5 C6 C7"],
"PRECHARGE": ["L L L L H AB", "BA0 BA1 BA2 V V V"],
"MPC": ["L L L L L OP6", "OP0 OP1 OP2 OP3 OP4 OP5"],
"DESELECT": ["X X X X X X", "X X X X X X"],
}
for cmd, (subcmd1, subcmd2) in TRUTH_TABLE.items():
assert len(subcmd1.split()) == 6, (cmd, subcmd1)
assert len(subcmd2.split()) == 6, (cmd, subcmd2)
def __init__(self, dfi_phase):
self.cs = Signal(2)
self.ca = Array([Signal(6), Signal(6)]) # CS high, CS low
self.dfi = dfi_phase
def set(self, cmd):
ops = []
for i, description in enumerate(self.TRUTH_TABLE[cmd]):
for j, bit in enumerate(description.split()):
ops.append(self.ca[i][j].eq(self.parse_bit(bit, is_mpc=cmd == "MPC")))
if cmd != "DESELECT":
ops.append(self.cs[0].eq(1))
return ops
def parse_bit(self, bit, is_mpc=False):
rules = {
"H": lambda: 1, # high
"L": lambda: 0, # low
"V": lambda: 0, # defined logic
"X": lambda: 0, # don't care
"BL": lambda: 0, # on-the-fly burst length, not using
"AP": lambda: self.dfi.address[10], # auto precharge
"AB": lambda: self.dfi.address[10], # all banks
"BA(\d+)": lambda i: self.dfi.bank[i],
"R(\d+)": lambda i: self.dfi.address[i], # row
"C(\d+)": lambda i: self.dfi.address[i], # column
"MA(\d+)": lambda i: self.dfi.address[8+i], # mode register address
"OP(\d+)": lambda i: self.dfi.address[i], # mode register value, or operand for MPC
}
for pattern, value in rules.items():
m = re.match(pattern, bit)
if m:
args = [int(g) for g in m.groups()]
return value(*args)
raise ValueError(bit)

View File

@ -4,8 +4,12 @@ from litex.soc.interconnect.csr import *
from litedram.common import *
from litedram.phy.dfi import *
from litedram.phy.lpddr4phy import LPDDR4PHY, delayed
from litedram.phy.lpddr4.utils import delayed
from litedram.phy.lpddr4.basephy import LPDDR4PHY
# TODO: add option to avoid ODELAYE2, for now it won't work on Artix7
class S7LPDDR4PHY(LPDDR4PHY):
def __init__(self, pads, *, iodelay_clk_freq, **kwargs):
self.iodelay_clk_freq = iodelay_clk_freq

View File

@ -0,0 +1,192 @@
from migen import *
from litedram.phy.lpddr4.utils import delayed
from litedram.phy.lpddr4.basephy import LPDDR4PHY
class LPDDR4SimulationPads(Module):
def __init__(self, databits=16):
self.clk_p = Signal()
self.clk_n = Signal()
self.cke = Signal()
self.odt = Signal()
self.reset_n = Signal()
self.cs = Signal()
self.ca = Signal(6)
# signals for checking actual tristate lines state (PHY reads these)
self.dq = Signal(databits)
self.dqs = Signal(databits//8)
self.dmi = Signal(databits//8)
# internal tristates i/o that should be driven for simulation
self.dq_o = Signal(databits) # PHY drives these
self.dq_i = Signal(databits) # DRAM chip (simulator) drives these
self.dq_oe = Signal() # PHY drives these
self.dqs_o = Signal(databits//8)
self.dqs_i = Signal(databits//8)
self.dqs_oe = Signal()
self.dmi_o = Signal(databits//8)
self.dmi_i = Signal(databits//8)
self.dmi_oe = Signal()
self.comb += [
If(self.dq_oe, self.dq.eq(self.dq_o)).Else(self.dq.eq(self.dq_i)),
If(self.dqs_oe, self.dqs.eq(self.dqs_o)).Else(self.dqs.eq(self.dqs_i)),
If(self.dmi_oe, self.dmi.eq(self.dmi_o)).Else(self.dmi.eq(self.dmi_i)),
]
class LPDDR4SimPHY(LPDDR4PHY):
def __init__(self, sys_clk_freq=100e6, aligned_reset_zero=False):
pads = LPDDR4SimulationPads()
self.submodules += pads
super().__init__(pads,
sys_clk_freq = sys_clk_freq,
write_ser_latency = Serializer.LATENCY,
read_des_latency = Deserializer.LATENCY,
phytype = "LPDDR4SimPHY")
def add_reset_value(phase, kwargs):
if aligned_reset_zero and phase == 0:
kwargs["reset_value"] = 0
# Serialization
def serialize(**kwargs):
name = 'ser_' + kwargs.pop('name', '')
ser = Serializer(o_dw=1, name=name.strip('_'), **kwargs)
self.submodules += ser
def deserialize(**kwargs):
name = 'des_' + kwargs.pop('name', '')
des = Deserializer(i_dw=1, name=name.strip('_'), **kwargs)
self.submodules += des
def ser_sdr(phase=0, **kwargs):
clkdiv = {0: "sys8x", 90: "sys8x_90"}[phase]
# clk = {0: "sys", 90: "sys_11_25"}[phase]
clk = {0: "sys", 90: "sys"}[phase]
add_reset_value(phase, kwargs)
serialize(clk=clk, clkdiv=clkdiv, i_dw=8, **kwargs)
def ser_ddr(phase=0, **kwargs):
# for simulation we require sys8x_ddr clock (=sys16x)
clkdiv = {0: "sys8x_ddr", 90: "sys8x_90_ddr"}[phase]
# clk = {0: "sys", 90: "sys_11_25"}[phase]
clk = {0: "sys", 90: "sys"}[phase]
add_reset_value(phase, kwargs)
serialize(clk=clk, clkdiv=clkdiv, i_dw=16, **kwargs)
def des_ddr(phase=0, **kwargs):
clkdiv = {0: "sys8x_ddr", 90: "sys8x_90_ddr"}[phase]
clk = {0: "sys", 90: "sys_11_25"}[phase]
add_reset_value(phase, kwargs)
deserialize(clk=clk, clkdiv=clkdiv, o_dw=16, **kwargs)
# Clock is shifted 180 degrees to get rising edge in the middle of SDR signals.
# To achieve that we send negated clock on clk_p and non-negated on clk_n.
ser_ddr(i=~self.ck_clk, o=self.pads.clk_p, name='clk_p')
ser_ddr(i=self.ck_clk, o=self.pads.clk_n, name='clk_n')
ser_sdr(i=self.ck_cke, o=self.pads.cke, name='cke')
ser_sdr(i=self.ck_odt, o=self.pads.odt, name='odt')
ser_sdr(i=self.ck_reset_n, o=self.pads.reset_n, name='reset_n')
# Command/address
ser_sdr(i=self.ck_cs, o=self.pads.cs, name='cs')
for i in range(6):
ser_sdr(i=self.ck_ca[i], o=self.pads.ca[i], name=f'ca{i}')
# Tristate I/O (separate for simulation)
for i in range(self.databits//8):
ser_ddr(i=self.ck_dmi_o[i], o=self.pads.dmi_o[i], name=f'dmi_o{i}')
des_ddr(o=self.ck_dmi_i[i], i=self.pads.dmi[i], name=f'dmi_i{i}')
ser_ddr(i=self.ck_dqs_o[i], o=self.pads.dqs_o[i], name=f'dqs_o{i}', phase=90)
des_ddr(o=self.ck_dqs_i[i], i=self.pads.dqs[i], name=f'dqs_i{i}', phase=90)
for i in range(self.databits):
ser_ddr(i=self.ck_dq_o[i], o=self.pads.dq_o[i], name=f'dq_o{i}')
des_ddr(o=self.ck_dq_i[i], i=self.pads.dq[i], name=f'dq_i{i}')
# Output enable signals
self.comb += self.pads.dmi_oe.eq(delayed(self, self.dmi_oe, cycles=Serializer.LATENCY))
self.comb += self.pads.dqs_oe.eq(delayed(self, self.dqs_oe, cycles=Serializer.LATENCY))
self.comb += self.pads.dq_oe.eq(delayed(self, self.dq_oe, cycles=Serializer.LATENCY))
class Serializer(Module):
"""Serialize given input signal
It latches the input data on the rising edge of `clk`. Output data counter `cnt` is incremented
on rising edges of `clkdiv` and it determines current slice of `i` that is presented on `o`.
`latency` is specified in `clk` cycles.
NOTE: both `clk` and `clkdiv` should be phase aligned.
NOTE: `reset_value` is set to `ratio - 1` so that on the first clock edge after reset it is 0
"""
LATENCY = 1
def __init__(self, clk, clkdiv, i_dw, o_dw, i=None, o=None, reset=None, reset_value=-1, name=None):
assert i_dw > o_dw
assert i_dw % o_dw == 0
ratio = i_dw // o_dw
sd_clk = getattr(self.sync, clk)
sd_clkdiv = getattr(self.sync, clkdiv)
if i is None: i = Signal(i_dw)
if o is None: o = Signal(o_dw)
if reset is None: reset = Signal()
self.i = i
self.o = o
self.reset = reset
if reset_value < 0:
reset_value = ratio + reset_value
cnt = Signal(max=ratio, reset=reset_value, name='{}_cnt'.format(name) if name is not None else None)
sd_clkdiv += If(reset | cnt == ratio - 1, cnt.eq(0)).Else(cnt.eq(cnt + 1))
i_d = Signal.like(self.i)
sd_clk += i_d.eq(self.i)
i_array = Array([i_d[n*o_dw:(n+1)*o_dw] for n in range(ratio)])
self.comb += self.o.eq(i_array[cnt])
class Deserializer(Module):
"""Deserialize given input signal
Latches the input data on the rising edges of `clkdiv` and stores them in the `o_pre` buffer.
Additional latency cycle is used to ensure that the last input bit is deserialized correctly.
NOTE: both `clk` and `clkdiv` should be phase aligned.
NOTE: `reset_value` is set to `ratio - 1` so that on the first clock edge after reset it is 0
"""
LATENCY = 2
def __init__(self, clk, clkdiv, i_dw, o_dw, i=None, o=None, reset=None, reset_value=-1, name=None):
assert i_dw < o_dw
assert o_dw % i_dw == 0
ratio = o_dw // i_dw
sd_clk = getattr(self.sync, clk)
sd_clkdiv = getattr(self.sync, clkdiv)
if i is None: i = Signal(i_dw)
if o is None: o = Signal(o_dw)
if reset is None: reset = Signal()
self.i = i
self.o = o
self.reset = reset
if reset_value < 0:
reset_value = ratio + reset_value
cnt = Signal(max=ratio, reset=reset_value, name='{}_cnt'.format(name) if name is not None else None)
sd_clkdiv += If(reset, cnt.eq(0)).Else(cnt.eq(cnt + 1))
o_pre = Signal.like(self.o)
o_array = Array([o_pre[n*i_dw:(n+1)*i_dw] for n in range(ratio)])
sd_clkdiv += o_array[cnt].eq(self.i)
# we need to ensure that the last bit will be correct if clocks are phase aligned
o_pre_d = Signal.like(self.o)
sd_clk += o_pre_d.eq(o_pre)
sd_clk += self.o.eq(Cat(o_pre_d[:-1], o_pre[-1])) # would work as self.comb (at least in simulation)

View File

@ -0,0 +1,71 @@
from functools import reduce
from operator import or_
from migen import *
from litedram.common import TappedDelayLine
def chunks(lst, n):
for i in range(0, len(lst), n):
yield lst[i:i + n]
def bitpattern(s):
if len(s) > 8:
return reduce(or_, [bitpattern(si) << (8*i) for i, si in enumerate(chunks(s, 8))])
assert len(s) == 8
s = s.translate(s.maketrans("_-", "01"))
return int(s[::-1], 2) # LSB first, so reverse the string
def delayed(mod, sig, cycles=1):
delay = TappedDelayLine(signal=sig, ntaps=cycles)
mod.submodules += delay
return delay.output
class ConstBitSlip(Module):
def __init__(self, dw, i=None, o=None, slp=None, cycles=1):
self.i = Signal(dw, name='i') if i is None else i
self.o = Signal(dw, name='o') if o is None else o
assert cycles >= 1
assert 0 <= slp <= cycles*dw-1
slp = (cycles*dw-1) - slp
# # #
self.r = r = Signal((cycles+1)*dw, reset_less=True)
self.sync += r.eq(Cat(r[dw:], self.i))
cases = {}
for i in range(cycles*dw):
cases[i] = self.o.eq(r[i+1:dw+i+1])
self.comb += Case(slp, cases)
# TODO: rewrite DQSPattern in litedram/common.py to support different data widths
class DQSPattern(Module):
def __init__(self, preamble=None, postamble=None, wlevel_en=0, wlevel_strobe=0, register=False):
self.preamble = Signal() if preamble is None else preamble
self.postamble = Signal() if postamble is None else postamble
self.o = Signal(16)
# # #
# DQS Pattern transmitted as LSB-first.
self.comb += [
self.o.eq(0b0101010101010101),
If(self.preamble,
self.o.eq(0b0001010101010101)
),
If(self.postamble,
self.o.eq(0b0101010101010100)
),
If(wlevel_en,
self.o.eq(0b0000000000000000),
If(wlevel_strobe,
self.o.eq(0b0000000000000001)
)
)
]
if register:
o = Signal.like(self.o)
self.sync += o.eq(self.o)
self.o = o

View File

@ -1,695 +0,0 @@
import re
from functools import reduce
from operator import or_
from collections import defaultdict
import math
from migen import *
from litex.soc.interconnect.csr import *
from litedram.common import *
from litedram.phy.dfi import *
def _chunks(lst, n):
for i in range(0, len(lst), n):
yield lst[i:i + n]
def bitpattern(s):
if len(s) > 8:
return reduce(or_, [bitpattern(si) << (8*i) for i, si in enumerate(_chunks(s, 8))])
assert len(s) == 8
s = s.translate(s.maketrans("_-", "01"))
return int(s[::-1], 2) # LSB first, so reverse the string
def delayed(mod, sig, cycles=1):
delay = TappedDelayLine(signal=sig, ntaps=cycles)
mod.submodules += delay
return delay.output
class ConstBitSlip(Module):
def __init__(self, dw, i=None, o=None, slp=None, cycles=1):
self.i = Signal(dw, name='i') if i is None else i
self.o = Signal(dw, name='o') if o is None else o
assert cycles >= 1
assert 0 <= slp <= cycles*dw-1
slp = (cycles*dw-1) - slp
# # #
self.r = r = Signal((cycles+1)*dw, reset_less=True)
self.sync += r.eq(Cat(r[dw:], self.i))
cases = {}
for i in range(cycles*dw):
cases[i] = self.o.eq(r[i+1:dw+i+1])
self.comb += Case(slp, cases)
# TODO: rewrite DQSPattern in common.py to support different data widths
class DQSPattern(Module):
def __init__(self, preamble=None, postamble=None, wlevel_en=0, wlevel_strobe=0, register=False):
self.preamble = Signal() if preamble is None else preamble
self.postamble = Signal() if postamble is None else postamble
self.o = Signal(16)
# # #
# DQS Pattern transmitted as LSB-first.
self.comb += [
self.o.eq(0b0101010101010101),
If(self.preamble,
self.o.eq(0b0001010101010101)
),
If(self.postamble,
self.o.eq(0b0101010101010100)
),
If(wlevel_en,
self.o.eq(0b0000000000000000),
If(wlevel_strobe,
self.o.eq(0b0000000000000001)
)
)
]
if register:
o = Signal.like(self.o)
self.sync += o.eq(self.o)
self.o = o
# LPDDR4PHY ----------------------------------------------------------------------------------------
class LPDDR4PHY(Module, AutoCSR):
def __init__(self, pads, *,
sys_clk_freq, write_ser_latency, read_des_latency, phytype, cmd_delay=None):
self.pads = pads
self.memtype = memtype = "LPDDR4"
self.nranks = nranks = 1 if not hasattr(pads, "cs_n") else len(pads.cs_n)
self.databits = databits = len(pads.dq)
self.addressbits = addressbits = 17 # for activate row address
self.bankbits = bankbits = 3
self.nphases = nphases = 8
self.tck = tck = 1 / (nphases*sys_clk_freq)
assert databits % 8 == 0
# Parameters -------------------------------------------------------------------------------
def get_cl_cw(memtype, tck):
# MT53E256M16D1, No DBI, Set A
f_to_cl_cwl = OrderedDict()
f_to_cl_cwl[ 532e6] = ( 6, 4) # FIXME: with that low cwl, wrtap is 0
f_to_cl_cwl[1066e6] = (10, 6)
f_to_cl_cwl[1600e6] = (14, 8)
f_to_cl_cwl[2132e6] = (20, 10)
f_to_cl_cwl[2666e6] = (24, 12)
f_to_cl_cwl[3200e6] = (28, 14)
f_to_cl_cwl[3732e6] = (32, 16)
f_to_cl_cwl[4266e6] = (36, 18)
for f, (cl, cwl) in f_to_cl_cwl.items():
if tck >= 2/f:
return cl, cwl
raise ValueError
# Bitslip introduces latency between from `cycles` up to `cycles + 1`
bitslip_cycles = 1
# Commands are sent over 4 cycles of DRAM clock (sys8x)
cmd_latency = 4
# Commands read from adapters are delayed on ConstBitSlips
ca_latency = 1
cl, cwl = get_cl_cw(memtype, tck)
cl_sys_latency = get_sys_latency(nphases, cl)
cwl_sys_latency = get_sys_latency(nphases, cwl)
rdphase = get_sys_phase(nphases, cl_sys_latency, cl + cmd_latency)
wrphase = get_sys_phase(nphases, cwl_sys_latency, cwl + cmd_latency)
# When the calculated phase is negative, it means that we need to increase sys latency
def updated_latency(phase):
delay_update = 0
while phase < 0:
phase += nphases
delay_update += 1
return phase, delay_update
wrphase, cwl_sys_delay = updated_latency(wrphase)
rdphase, cl_sys_delay = updated_latency(rdphase)
cwl_sys_latency += cwl_sys_delay
cl_sys_latency += cl_sys_delay
# Read latency
read_data_delay = ca_latency + write_ser_latency + cl_sys_latency # DFI cmd -> read data on DQ
read_des_delay = read_des_latency + bitslip_cycles # data on DQ -> data on DFI rddata
read_latency = read_data_delay + read_des_delay
# Write latency
write_latency = cwl_sys_latency
# FIXME: remove
if __import__("os").environ.get("DEBUG") == '1':
print('cl', end=' = '); __import__('pprint').pprint(cl)
print('cwl', end=' = '); __import__('pprint').pprint(cwl)
print('cl_sys_latency', end=' = '); __import__('pprint').pprint(cl_sys_latency)
print('cwl_sys_latency', end=' = '); __import__('pprint').pprint(cwl_sys_latency)
print('rdphase', end=' = '); __import__('pprint').pprint(rdphase)
print('wrphase', end=' = '); __import__('pprint').pprint(wrphase)
print('read_data_delay', end=' = '); __import__('pprint').pprint(read_data_delay)
print('read_des_delay', end=' = '); __import__('pprint').pprint(read_des_delay)
print('read_latency', end=' = '); __import__('pprint').pprint(read_latency)
print('write_latency', end=' = '); __import__('pprint').pprint(write_latency)
# Registers --------------------------------------------------------------------------------
self._rst = CSRStorage()
self._dly_sel = CSRStorage(databits//8)
self._wlevel_en = CSRStorage()
self._wlevel_strobe = CSR()
self._dly_sel = CSRStorage(databits//8)
self._rdly_dq_bitslip_rst = CSR()
self._rdly_dq_bitslip = CSR()
self._wdly_dq_bitslip_rst = CSR()
self._wdly_dq_bitslip = CSR()
self._rdphase = CSRStorage(int(math.log2(nphases)), reset=rdphase)
self._wrphase = CSRStorage(int(math.log2(nphases)), reset=wrphase)
# PHY settings -----------------------------------------------------------------------------
self.settings = PhySettings(
phytype = phytype,
memtype = memtype,
databits = databits,
dfi_databits = 2*databits,
nranks = nranks,
nphases = nphases,
rdphase = self._rdphase.storage,
wrphase = self._wrphase.storage,
cl = cl,
cwl = cwl,
read_latency = read_latency,
write_latency = write_latency,
cmd_latency = cmd_latency,
cmd_delay = cmd_delay,
)
# DFI Interface ----------------------------------------------------------------------------
# Due to the fact that LPDDR4 has 16n prefetch we use 8 phases to be able to read/write a
# whole burst during a single controller clock cycle. PHY should use sys8x clock.
self.dfi = dfi = Interface(addressbits, bankbits, nranks, 2*databits, nphases=8)
# # #
adapters = [DFIPhaseAdapter(phase) for phase in self.dfi.phases]
self.submodules += adapters
# Now prepare the data by converting the sequences on adapters into sequences on the pads.
# We have to ignore overlapping commands, and module timings have to ensure that there are
# no overlapping commands anyway.
# Pads: reset_n, CS, CKE, CK, CA[5:0], DMI[1:0], DQ[15:0], DQS[1:0], ODT_CA
self.ck_clk = Signal(2*nphases)
self.ck_cke = Signal(nphases)
self.ck_odt = Signal(nphases)
self.ck_reset_n = Signal(nphases)
self.ck_cs = Signal(nphases)
self.ck_ca = [Signal(nphases) for _ in range(6)]
self.ck_dmi_o = [Signal(2*nphases) for _ in range(2)]
self.ck_dmi_i = [Signal(2*nphases) for _ in range(2)]
self.dmi_oe = Signal()
self.ck_dq_o = [Signal(2*nphases) for _ in range(databits)]
self.ck_dq_i = [Signal(2*nphases) for _ in range(databits)]
self.dq_oe = Signal()
self.ck_dqs_o = [Signal(2*nphases) for _ in range(2)]
self.ck_dqs_i = [Signal(2*nphases) for _ in range(2)]
self.dqs_oe = Signal()
# Clocks -----------------------------------------------------------------------------------
self.comb += self.ck_clk.eq(bitpattern("-_-_-_-_" * 2))
# Simple commands --------------------------------------------------------------------------
self.comb += [
self.ck_cke.eq(Cat(delayed(self, phase.cke) for phase in self.dfi.phases)),
self.ck_odt.eq(Cat(delayed(self, phase.odt) for phase in self.dfi.phases)),
self.ck_reset_n.eq(Cat(delayed(self, phase.reset_n) for phase in self.dfi.phases)),
]
# LPDDR4 Commands --------------------------------------------------------------------------
# Each command can span several phases (up to 4), so we must ignore overlapping commands,
# but in general, module timings should be set in a way that overlapping will never happen.
# Create a history of valid adapters used for masking overlapping ones.
# TODO: make optional, as it takes up resources and the controller should ensure no overlaps
valids = ConstBitSlip(dw=nphases, cycles=1, slp=0)
self.submodules += valids
self.comb += valids.i.eq(Cat(a.valid for a in adapters))
# valids_hist = valids.r
valids_hist = Signal.like(valids.r)
# TODO: especially make this part optional
for i in range(len(valids_hist)):
was_valid_before = reduce(or_, valids_hist[max(0, i-3):i], 0)
self.comb += valids_hist[i].eq(valids.r[i] & ~was_valid_before)
cs_per_adapter = []
ca_per_adapter = defaultdict(list)
for phase, adapter in enumerate(adapters):
# The signals from an adapter can be used if there were no commands on 3 previous cycles
allowed = ~reduce(or_, valids_hist[nphases+phase - 3:nphases+phase])
# Use CS and CA of given adapter slipped by `phase` bits
cs_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
self.submodules += cs_bs
self.comb += cs_bs.i.eq(Cat(adapter.cs)),
cs_mask = Replicate(allowed, len(cs_bs.o))
cs = cs_bs.o & cs_mask
cs_per_adapter.append(cs)
# For CA we need to do the same for each bit
ca_bits = []
for bit in range(6):
ca_bs = ConstBitSlip(dw=nphases, cycles=1, slp=phase)
self.submodules += ca_bs
ca_bit_hist = [adapter.ca[i][bit] for i in range(4)]
self.comb += ca_bs.i.eq(Cat(*ca_bit_hist)),
ca_mask = Replicate(allowed, len(ca_bs.o))
ca = ca_bs.o & ca_mask
ca_per_adapter[bit].append(ca)
# OR all the masked signals
self.comb += self.ck_cs.eq(reduce(or_, cs_per_adapter))
for bit in range(6):
self.comb += self.ck_ca[bit].eq(reduce(or_, ca_per_adapter[bit]))
# DQ ---------------------------------------------------------------------------------------
dq_oe = Signal()
self.comb += self.dq_oe.eq(delayed(self, dq_oe, cycles=1))
for bit in range(self.databits):
# output
self.submodules += BitSlip(
dw = 2*nphases,
cycles = bitslip_cycles,
rst = (self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage,
slp = self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip.re,
i = Cat(*[self.dfi.phases[i//2].wrdata[i%2 * self.databits + bit] for i in range(2*nphases)]),
o = self.ck_dq_o[bit],
)
# input
dq_i_bs = Signal(2*nphases)
self.submodules += BitSlip(
dw = 2*nphases,
cycles = bitslip_cycles,
rst = (self._dly_sel.storage[bit//8] & self._rdly_dq_bitslip_rst.re) | self._rst.storage,
slp = self._dly_sel.storage[bit//8] & self._rdly_dq_bitslip.re,
i = self.ck_dq_i[bit],
o = dq_i_bs,
)
for i in range(2*nphases):
self.comb += self.dfi.phases[i//2].rddata[i%2 * self.databits + bit].eq(dq_i_bs[i])
# DQS --------------------------------------------------------------------------------------
dqs_oe = Signal()
dqs_preamble = Signal()
dqs_postamble = Signal()
dqs_pattern = DQSPattern(
preamble = dqs_preamble, # FIXME: are defined the opposite way (common.py) ???
postamble = dqs_postamble,
wlevel_en = self._wlevel_en.storage,
wlevel_strobe = self._wlevel_strobe.re)
self.submodules += dqs_pattern
self.comb += [
self.dqs_oe.eq(delayed(self, dqs_oe, cycles=1)),
]
for bit in range(self.databits//8):
# output
self.submodules += BitSlip(
dw = 2*nphases,
cycles = bitslip_cycles,
rst = (self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip_rst.re) | self._rst.storage,
slp = self._dly_sel.storage[bit//8] & self._wdly_dq_bitslip.re,
i = dqs_pattern.o,
o = self.ck_dqs_o[bit],
)
# DMI --------------------------------------------------------------------------------------
# DMI signal is used for Data Mask or Data Bus Invertion depending on Mode Registers values.
# With DM and DBI disabled, this signal is a Don't Care.
# With DM enabled, masking is performed only when the command used is WRITE-MASKED.
# TODO: use WRITE-MASKED for all write commands, and configure Mode Registers for that
# during DRAM initialization (we don't want to support DBI).
for bin in range(self.databits//8):
self.comb += self.ck_dmi_o[bit].eq(0)
# Read Control Path ------------------------------------------------------------------------
# Creates a delay line of read commands coming from the DFI interface. The output is used to
# signal a valid read data to the DFI interface.
#
# The read data valid is asserted for 1 sys_clk cycle when the data is available on the DFI
# interface, the latency is the sum of the OSERDESE2, CAS, ISERDESE2 and Bitslip latencies.
rddata_en = TappedDelayLine(
signal = reduce(or_, [dfi.phases[i].rddata_en for i in range(nphases)]),
ntaps = self.settings.read_latency
)
self.submodules += rddata_en
self.comb += [phase.rddata_valid.eq(rddata_en.output | self._wlevel_en.storage) for phase in dfi.phases]
# Write Control Path -----------------------------------------------------------------------
wrtap = cwl_sys_latency - 1
assert wrtap >= 1
# Create a delay line of write commands coming from the DFI interface. This taps are used to
# control DQ/DQS tristates.
wrdata_en = TappedDelayLine(
signal = reduce(or_, [dfi.phases[i].wrdata_en for i in range(nphases)]),
ntaps = wrtap + 2
)
self.submodules += wrdata_en
self.comb += dq_oe.eq(wrdata_en.taps[wrtap])
self.comb += If(self._wlevel_en.storage, dqs_oe.eq(1)).Else(dqs_oe.eq(dqs_preamble | dq_oe | dqs_postamble))
# Write DQS Postamble/Preamble Control Path ------------------------------------------------
# Generates DQS Preamble 1 cycle before the first write and Postamble 1 cycle after the last
# write. During writes, DQS tristate is configured as output for at least 3 sys_clk cycles:
# 1 for Preamble, 1 for the Write and 1 for the Postamble.
self.comb += dqs_preamble.eq( wrdata_en.taps[wrtap - 1] & ~wrdata_en.taps[wrtap + 0])
self.comb += dqs_postamble.eq(wrdata_en.taps[wrtap + 1] & ~wrdata_en.taps[wrtap + 0])
class DFIPhaseAdapter(Module):
# We must perform mapping of DFI commands to the LPDDR4 commands set on CA bus.
# LPDDR4 "small command" consists of 2 words CA[5:0] sent on the bus in 2 subsequent
# cycles. First cycle is marked with CS high, second with CS low.
# Then most "big commands" consist of 2 "small commands" (e.g. ACTIVATE-1, ACTIVATE-2).
# If a command uses 1 "small command", then it shall go as cmd2 so that all command
# timings can be counted from the same moment (cycle of cmd2 CS low).
# MPC (multipurpose command) can be used to perform different actions
# We use ZQC with BA=0 to issue MPC, where OP[6:0] = A[6:0]
MPC = {
"NOP": 0b0000000, # only OP[6] must be 0
"READ-FIFO": 0b1000001,
"READ-DQ-CAL": 0b1000011,
# RFU: 0b1000101
"WRITE-FIFO": 0b1000111,
# RFU: 0b1001001
"START-DQS-OSC": 0b1001011,
"STOP-DQS-OSC": 0b1001101,
"ZQC-START": 0b1001111,
"ZQC-LATCH": 0b1010001,
}
def __init__(self, dfi_phase):
# CS/CA values for 4 SDR cycles
self.cs = Signal(4)
self.ca = Array([Signal(6) for _ in range(4)])
self.valid = Signal()
# # #
self.submodules.cmd1 = Command(dfi_phase)
self.submodules.cmd2 = Command(dfi_phase)
self.comb += [
self.cs[:2].eq(self.cmd1.cs),
self.cs[2:].eq(self.cmd2.cs),
self.ca[0].eq(self.cmd1.ca[0]),
self.ca[1].eq(self.cmd1.ca[1]),
self.ca[2].eq(self.cmd2.ca[0]),
self.ca[3].eq(self.cmd2.ca[1]),
]
dfi_cmd = Signal(3)
self.comb += dfi_cmd.eq(Cat(~dfi_phase.we_n, ~dfi_phase.ras_n, ~dfi_phase.cas_n)),
_cmd = { # cas, ras, we
"NOP": 0b000,
"ACT": 0b010,
"RD": 0b100,
"WR": 0b101,
"PRE": 0b011,
"REF": 0b110,
"ZQC": 0b001,
"MRS": 0b111,
}
def cmds(cmd1, cmd2, valid=1):
return self.cmd1.set(cmd1) + self.cmd2.set(cmd2) + [self.valid.eq(valid)]
self.comb += If(dfi_phase.cs_n == 0, # require dfi.cs_n
Case(dfi_cmd, {
_cmd["ACT"]: cmds("ACTIVATE-1", "ACTIVATE-2"),
_cmd["RD"]: cmds("READ-1", "CAS-2"),
_cmd["WR"]: cmds("WRITE-1", "CAS-2"), # TODO: masked write
_cmd["PRE"]: cmds("DESELECT", "PRECHARGE"),
_cmd["REF"]: cmds("DESELECT", "REFRESH"),
_cmd["ZQC"]: cmds("DESELECT", "MPC"),
_cmd["MRS"]: cmds("MRW-1", "MRW-2"),
"default": cmds("DESELECT", "DESELECT", valid=0),
})
)
class Command(Module):
# String description of 1st and 2nd edge of each command, later parsed to construct
# the value. CS is assumed to be H for 1st edge and L for 2nd edge.
TRUTH_TABLE = {
"MRW-1": ["L H H L L OP7", "MA0 MA1 MA2 MA3 MA4 MA5"],
"MRW-2": ["L H H L H OP6", "OP0 OP1 OP2 OP3 OP4 OP5"],
"MRR-1": ["L H H H L V", "MA0 MA1 MA2 MA3 MA4 MA5"],
"REFRESH": ["L L L H L AB", "BA0 BA1 BA2 V V V"],
"ACTIVATE-1": ["H L R12 R13 R14 R15", "BA0 BA1 BA2 R16 R10 R11"],
"ACTIVATE-2": ["H H R6 R7 R8 R9", "R0 R1 R2 R3 R4 R5"],
"WRITE-1": ["L L H L L BL", "BA0 BA1 BA2 V C9 AP"],
"MASK WRITE-1": ["L L H H L BL", "BA0 BA1 BA2 V C9 AP"],
"READ-1": ["L H L L L BL", "BA0 BA1 BA2 V C9 AP"],
"CAS-2": ["L H L L H C8", "C2 C3 C4 C5 C6 C7"],
"PRECHARGE": ["L L L L H AB", "BA0 BA1 BA2 V V V"],
"MPC": ["L L L L L OP6", "OP0 OP1 OP2 OP3 OP4 OP5"],
"DESELECT": ["X X X X X X", "X X X X X X"],
}
for cmd, (subcmd1, subcmd2) in TRUTH_TABLE.items():
assert len(subcmd1.split()) == 6, (cmd, subcmd1)
assert len(subcmd2.split()) == 6, (cmd, subcmd2)
def __init__(self, dfi_phase):
self.cs = Signal(2)
self.ca = Array([Signal(6), Signal(6)]) # CS high, CS low
self.dfi = dfi_phase
def set(self, cmd):
ops = []
for i, description in enumerate(self.TRUTH_TABLE[cmd]):
for j, bit in enumerate(description.split()):
ops.append(self.ca[i][j].eq(self.parse_bit(bit, is_mpc=cmd == "MPC")))
if cmd != "DESELECT":
ops.append(self.cs[0].eq(1))
return ops
def parse_bit(self, bit, is_mpc=False):
rules = {
"H": lambda: 1, # high
"L": lambda: 0, # low
"V": lambda: 0, # defined logic
"X": lambda: 0, # don't care
"BL": lambda: 0, # on-the-fly burst length, not using
"AP": lambda: self.dfi.address[10], # auto precharge
"AB": lambda: self.dfi.address[10], # all banks
"BA(\d+)": lambda i: self.dfi.bank[i],
"R(\d+)": lambda i: self.dfi.address[i], # row
"C(\d+)": lambda i: self.dfi.address[i], # column
"MA(\d+)": lambda i: self.dfi.address[8+i], # mode register address
"OP(\d+)": lambda i: self.dfi.address[i], # mode register value, or operand for MPC
}
for pattern, value in rules.items():
m = re.match(pattern, bit)
if m:
args = [int(g) for g in m.groups()]
return value(*args)
raise ValueError(bit)
# SimulationPHY ------------------------------------------------------------------------------------
class LPDDR4SimulationPads(Module):
def __init__(self, databits=16):
self.clk_p = Signal()
self.clk_n = Signal()
self.cke = Signal()
self.odt = Signal()
self.reset_n = Signal()
self.cs = Signal()
self.ca = Signal(6)
# signals for checking actual tristate lines state (PHY reads these)
self.dq = Signal(databits)
self.dqs = Signal(databits//8)
self.dmi = Signal(databits//8)
# internal tristates i/o that should be driven for simulation
self.dq_o = Signal(databits) # PHY drives these
self.dq_i = Signal(databits) # DRAM chip (simulator) drives these
self.dq_oe = Signal() # PHY drives these
self.dqs_o = Signal(databits//8)
self.dqs_i = Signal(databits//8)
self.dqs_oe = Signal()
self.dmi_o = Signal(databits//8)
self.dmi_i = Signal(databits//8)
self.dmi_oe = Signal()
self.comb += [
If(self.dq_oe, self.dq.eq(self.dq_o)).Else(self.dq.eq(self.dq_i)),
If(self.dqs_oe, self.dqs.eq(self.dqs_o)).Else(self.dqs.eq(self.dqs_i)),
If(self.dmi_oe, self.dmi.eq(self.dmi_o)).Else(self.dmi.eq(self.dmi_i)),
]
class SimulationPHY(LPDDR4PHY):
def __init__(self, sys_clk_freq=100e6, aligned_reset_zero=False):
pads = LPDDR4SimulationPads()
self.submodules += pads
super().__init__(pads,
sys_clk_freq = sys_clk_freq,
write_ser_latency = Serializer.LATENCY,
read_des_latency = Deserializer.LATENCY,
phytype = "SimulationPHY")
def add_reset_value(phase, kwargs):
if aligned_reset_zero and phase == 0:
kwargs["reset_value"] = 0
# Serialization
def serialize(**kwargs):
name = 'ser_' + kwargs.pop('name', '')
ser = Serializer(o_dw=1, name=name.strip('_'), **kwargs)
self.submodules += ser
def deserialize(**kwargs):
name = 'des_' + kwargs.pop('name', '')
des = Deserializer(i_dw=1, name=name.strip('_'), **kwargs)
self.submodules += des
def ser_sdr(phase=0, **kwargs):
clkdiv = {0: "sys8x", 90: "sys8x_90"}[phase]
# clk = {0: "sys", 90: "sys_11_25"}[phase]
clk = {0: "sys", 90: "sys"}[phase]
add_reset_value(phase, kwargs)
serialize(clk=clk, clkdiv=clkdiv, i_dw=8, **kwargs)
def ser_ddr(phase=0, **kwargs):
# for simulation we require sys8x_ddr clock (=sys16x)
clkdiv = {0: "sys8x_ddr", 90: "sys8x_90_ddr"}[phase]
# clk = {0: "sys", 90: "sys_11_25"}[phase]
clk = {0: "sys", 90: "sys"}[phase]
add_reset_value(phase, kwargs)
serialize(clk=clk, clkdiv=clkdiv, i_dw=16, **kwargs)
def des_ddr(phase=0, **kwargs):
clkdiv = {0: "sys8x_ddr", 90: "sys8x_90_ddr"}[phase]
clk = {0: "sys", 90: "sys_11_25"}[phase]
add_reset_value(phase, kwargs)
deserialize(clk=clk, clkdiv=clkdiv, o_dw=16, **kwargs)
# Clock is shifted 180 degrees to get rising edge in the middle of SDR signals.
# To achieve that we send negated clock on clk_p and non-negated on clk_n.
ser_ddr(i=~self.ck_clk, o=self.pads.clk_p, name='clk_p')
ser_ddr(i=self.ck_clk, o=self.pads.clk_n, name='clk_n')
ser_sdr(i=self.ck_cke, o=self.pads.cke, name='cke')
ser_sdr(i=self.ck_odt, o=self.pads.odt, name='odt')
ser_sdr(i=self.ck_reset_n, o=self.pads.reset_n, name='reset_n')
# Command/address
ser_sdr(i=self.ck_cs, o=self.pads.cs, name='cs')
for i in range(6):
ser_sdr(i=self.ck_ca[i], o=self.pads.ca[i], name=f'ca{i}')
# Tristate I/O (separate for simulation)
for i in range(self.databits//8):
ser_ddr(i=self.ck_dmi_o[i], o=self.pads.dmi_o[i], name=f'dmi_o{i}')
des_ddr(o=self.ck_dmi_i[i], i=self.pads.dmi[i], name=f'dmi_i{i}')
ser_ddr(i=self.ck_dqs_o[i], o=self.pads.dqs_o[i], name=f'dqs_o{i}', phase=90)
des_ddr(o=self.ck_dqs_i[i], i=self.pads.dqs[i], name=f'dqs_i{i}', phase=90)
for i in range(self.databits):
ser_ddr(i=self.ck_dq_o[i], o=self.pads.dq_o[i], name=f'dq_o{i}')
des_ddr(o=self.ck_dq_i[i], i=self.pads.dq[i], name=f'dq_i{i}')
# Output enable signals
self.comb += self.pads.dmi_oe.eq(delayed(self, self.dmi_oe, cycles=Serializer.LATENCY))
self.comb += self.pads.dqs_oe.eq(delayed(self, self.dqs_oe, cycles=Serializer.LATENCY))
self.comb += self.pads.dq_oe.eq(delayed(self, self.dq_oe, cycles=Serializer.LATENCY))
class Serializer(Module):
"""Serialize given input signal
It latches the input data on the rising edge of `clk`. Output data counter `cnt` is incremented
on rising edges of `clkdiv` and it determines current slice of `i` that is presented on `o`.
`latency` is specified in `clk` cycles.
NOTE: both `clk` and `clkdiv` should be phase aligned.
NOTE: `reset_value` is set to `ratio - 1` so that on the first clock edge after reset it is 0
"""
LATENCY = 1
def __init__(self, clk, clkdiv, i_dw, o_dw, i=None, o=None, reset=None, reset_value=-1, name=None):
assert i_dw > o_dw
assert i_dw % o_dw == 0
ratio = i_dw // o_dw
sd_clk = getattr(self.sync, clk)
sd_clkdiv = getattr(self.sync, clkdiv)
if i is None: i = Signal(i_dw)
if o is None: o = Signal(o_dw)
if reset is None: reset = Signal()
self.i = i
self.o = o
self.reset = reset
if reset_value < 0:
reset_value = ratio + reset_value
cnt = Signal(max=ratio, reset=reset_value, name='{}_cnt'.format(name) if name is not None else None)
sd_clkdiv += If(reset | cnt == ratio - 1, cnt.eq(0)).Else(cnt.eq(cnt + 1))
i_d = Signal.like(self.i)
sd_clk += i_d.eq(self.i)
i_array = Array([i_d[n*o_dw:(n+1)*o_dw] for n in range(ratio)])
self.comb += self.o.eq(i_array[cnt])
class Deserializer(Module):
"""Deserialize given input signal
Latches the input data on the rising edges of `clkdiv` and stores them in the `o_pre` buffer.
Additional latency cycle is used to ensure that the last input bit is deserialized correctly.
NOTE: both `clk` and `clkdiv` should be phase aligned.
NOTE: `reset_value` is set to `ratio - 1` so that on the first clock edge after reset it is 0
"""
LATENCY = 2
def __init__(self, clk, clkdiv, i_dw, o_dw, i=None, o=None, reset=None, reset_value=-1, name=None):
assert i_dw < o_dw
assert o_dw % i_dw == 0
ratio = o_dw // i_dw
sd_clk = getattr(self.sync, clk)
sd_clkdiv = getattr(self.sync, clkdiv)
if i is None: i = Signal(i_dw)
if o is None: o = Signal(o_dw)
if reset is None: reset = Signal()
self.i = i
self.o = o
self.reset = reset
if reset_value < 0:
reset_value = ratio + reset_value
cnt = Signal(max=ratio, reset=reset_value, name='{}_cnt'.format(name) if name is not None else None)
sd_clkdiv += If(reset, cnt.eq(0)).Else(cnt.eq(cnt + 1))
o_pre = Signal.like(self.o)
o_array = Array([o_pre[n*i_dw:(n+1)*i_dw] for n in range(ratio)])
sd_clkdiv += o_array[cnt].eq(self.i)
# we need to ensure that the last bit will be correct if clocks are phase aligned
o_pre_d = Signal.like(self.o)
sd_clk += o_pre_d.eq(o_pre)
sd_clk += self.o.eq(Cat(o_pre_d[:-1], o_pre[-1])) # would work as self.comb (at least in simulation)

View File

@ -10,7 +10,7 @@ from typing import Mapping, Sequence
from migen import *
from litedram.phy import dfi
from litedram.phy.lpddr4phy import SimulationPHY, Serializer, Deserializer
from litedram.phy.lpddr4.simphy import LPDDR4SimPHY, Serializer, Deserializer
from litex.gen.sim import run_simulation as _run_simulation
@ -422,7 +422,7 @@ class TestLPDDR4(unittest.TestCase):
def test_lpddr4_cs_phase_0(self):
# Test that CS is serialized correctly when sending command on phase 0
latency = '00000000' * self.CMD_LATENCY
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = [
{0: dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)}, # p0: READ
],
@ -434,7 +434,7 @@ class TestLPDDR4(unittest.TestCase):
def test_lpddr4_clk(self):
# Test clock serialization, first few cycles are undefined so ignore them
latency = 'xxxxxxxx' * self.CMD_LATENCY
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = [
{3: dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)},
],
@ -446,7 +446,7 @@ class TestLPDDR4(unittest.TestCase):
def test_lpddr4_cs_multiple_phases(self):
# Test that CS is serialized on different phases and that overlapping commands are handled
latency = '00000000' * self.CMD_LATENCY
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = [
{0: dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)},
{3: dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)},
@ -479,7 +479,7 @@ class TestLPDDR4(unittest.TestCase):
# Test proper serialization of commands to CA pads and that overlapping commands are handled
latency = '00000000' * self.CMD_LATENCY
read = dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = [
{0: read, 3: read}, # p4 should be ignored
{0: read, 4: read},
@ -508,7 +508,7 @@ class TestLPDDR4(unittest.TestCase):
mrw = dict(cs_n=0, cas_n=0, ras_n=0, we_n=0, bank=0, address=(0b110011 << 8) | 0b10101010) # 6-bit address | 8-bit op code
zqc_start = dict(cs_n=0, cas_n=1, ras_n=1, we_n=0, bank=0, address=0b1001111) # MPC with ZQCAL START operand
zqc_latch = dict(cs_n=0, cas_n=1, ras_n=1, we_n=0, bank=0, address=0b1010001) # MPC with ZQCAL LATCH operand
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = [
{0: read, 4: write_ap},
{0: activate, 4: refresh_ab},
@ -532,7 +532,7 @@ class TestLPDDR4(unittest.TestCase):
# Test serialization of DFI command pins (cs/cke/odt/reset_n)
latency = '00000000' * self.CMD_LATENCY
read = dict(cs_n=0, cas_n=0, ras_n=1, we_n=1)
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = [
{
0: dict(cke=1, odt=1, reset_n=1, **read),
@ -552,7 +552,7 @@ class TestLPDDR4(unittest.TestCase):
def test_lpddr4_dq_out(self):
# Test serialization of dfi wrdata to DQ pads
dut = SimulationPHY()
dut = LPDDR4SimPHY()
zero = '00000000' * 2 # zero for 1 sysclk clock in sys8x_ddr clock domain
dfi_data = {
@ -576,7 +576,7 @@ class TestLPDDR4(unittest.TestCase):
def test_lpddr4_dq_only_1cycle(self):
# Test that DQ data is sent to pads only during expected cycle, on other cycles there is no data
dut = SimulationPHY()
dut = LPDDR4SimPHY()
zero = '00000000' * 2
dfi_data = {
@ -603,7 +603,7 @@ class TestLPDDR4(unittest.TestCase):
# Test serialization of DQS pattern in relation to DQ data, with proper preamble and postamble
zero = '00000000' * 2
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = [
{0: dict(wrdata_en=1)},
{},
@ -634,7 +634,7 @@ class TestLPDDR4(unittest.TestCase):
# Test proper output on DMI pads. We don't implement masking now, so nothing should be sent to DMI pads
zero = '00000000' * 2
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = [
{0: dict(wrdata_en=1)},
{},
@ -670,7 +670,7 @@ class TestLPDDR4(unittest.TestCase):
{},
]
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = dfi_sequence,
pad_checkers = {},
pad_generators = {},
@ -710,7 +710,7 @@ class TestLPDDR4(unittest.TestCase):
{},
]
self.run_test(SimulationPHY(),
self.run_test(LPDDR4SimPHY(),
dfi_sequence = dfi_sequence,
pad_checkers = {},
pad_generators = {
@ -720,7 +720,7 @@ class TestLPDDR4(unittest.TestCase):
def test_lpddr4_cmd_write(self):
# Test whole WRITE command sequence verifying data on pads and write_latency from MC perspective
phy = SimulationPHY()
phy = LPDDR4SimPHY()
zero = '00000000' * 2
write_latency = phy.settings.write_latency
wrphase = phy.settings.wrphase.reset.value
@ -770,7 +770,7 @@ class TestLPDDR4(unittest.TestCase):
def test_lpddr4_cmd_read(self):
# Test whole READ command sequence simulating DRAM response and verifying read_latency from MC perspective
phy = SimulationPHY()
phy = LPDDR4SimPHY()
zero = '00000000' * 2
read_latency = phy.settings.read_latency
rdphase = phy.settings.rdphase.reset.value