From cc6ed667dfc28d7e869f011f92ee286ee944a019 Mon Sep 17 00:00:00 2001 From: bunnie Date: Thu, 6 Feb 2020 17:25:00 +0800 Subject: [PATCH] Request to merge I2S and SPIOPI cores --- litex/soc/cores/i2s.py | 435 +++++++++++++++ litex/soc/cores/spiopi.py | 1080 +++++++++++++++++++++++++++++++++++++ 2 files changed, 1515 insertions(+) create mode 100644 litex/soc/cores/i2s.py create mode 100644 litex/soc/cores/spiopi.py diff --git a/litex/soc/cores/i2s.py b/litex/soc/cores/i2s.py new file mode 100644 index 000000000..1125f7920 --- /dev/null +++ b/litex/soc/cores/i2s.py @@ -0,0 +1,435 @@ +# This file is Copyright (c) 2020 bunnie +# License: BSD + +from litex.soc.interconnect.csr_eventmanager import * +from litex.soc.interconnect import wishbone +from litex.soc.integration.doc import AutoDoc, ModuleDoc +from migen.genlib.cdc import MultiReg + +class i2s_slave(Module, AutoCSR, AutoDoc): + def __init__(self, pads, fifodepth=256): + self.intro = ModuleDoc(""" + Intro + ******* + + I2S slave creates a slave audio interface instance. Tx and Rx interfaces are inferred + based upon the presence or absence of the respective pins in the "pads" argument. + + The interface is I2S-like, but note the deviation that the bits are justified + left without a 1-bit pad after sync edges. This isn't a problem for talking to the LM49352 + codec this was designed for, as the bit offset is programmable, but this will not work well if + are talking to a CODEC without a programmable bit offset! + + System Interface + ================= + + Audio interchange is done with the system using 16-bit stereo samples, with the right channel + mapped to the least significant word of a 32-bit word. Thus each 32-bit word is a single + stereo sample. As this is a slave I2S interface, sampling rate and framing is set by the programming + of the audio CODEC chip. 
A slave situation is preferred because this defers the generation of + audio clocks to the CODEC, which has PLLs specialized to generate the correct frequencies for + audio sampling rates. + + `fifodepth` is the depth at which either a read interrupt is fired (guaranteeing at least `fifodepth` + stereo samples in the receive FIFO) or a write interrupt is fired (guaranteeing at least `fifodepth` + free space in the transmit FIFO). The FIFO itself is 512 entries deep; `fifodepth` is clamped to the range 8 to 504. + + To receive audio data: + + - reset the Rx FIFO, to guarantee all pointers at zero + - hook the Rx full interrupt with an interrupt handler (optional) + - if the CODEC is not yet transmitting data, initiate data transmission + - enable Rx FIFO to run + - poll or wait for interrupt; upon interrupt, read `fifodepth` words. Repeat. + - to close the stream, simply clear the Rx FIFO enable bit. The next initiation should call a + reset of the FIFO to ensure leftover previous data is cleared from the FIFO. + + To transmit audio data: + + - reset the Tx FIFO, to guarantee all pointers at zero + - hook the Tx available interrupt with an interrupt handler (optional) + - write 512 words of data into the Tx FIFO, filling it to the max + - if the CODEC is not yet requesting data and unmuted, unmute and initiate reception + - enable the Tx FIFO to run + - poll or wait for interrupt; upon interrupt, write `fifodepth` words. Repeat. + - to close stream, mute the DAC and stop the request clock. Ideally, this can be completed before + the FIFO is emptied, so there is no jarring pop or truncation of data + - stop FIFO running. Next initiation should reset the FIFO to ensure leftover previous data in + FIFO is cleared. + + CODEC Interface + ================ + + The interface assumes we have a sysclk domain running around 100MHz, and that our typical + max audio rate is 44.1kHz * 24bits * 2channels = 2.1168MHz audio clock. 
Thus, the architecture + treats the audio clock and data as asynchronous inputs that are MultiReg-syncd into the clock + domain. Probably the slowest sysclk rate this might work with is around 20-25MHz (10x over + sampling), but at 100MHz things will be quite comfortable. + + The upside of the fully asynchronous implementation is that we can leave the I/O unconstrained, + giving the place/route more latitude to do its job. + + Here's the timing format targeted by this I2S interface: + + .. wavedrom:: + :caption: Timing format of the I2S interface + + { "signal" : [ + { "name": "clk", "wave": "n....|.......|......" }, + { "name": "sync", "wave": "1.0..|....1..|....0." }, + { "name": "tx/rx", "wave": ".====|==x.===|==x.=x", "data": ["L15", "L14", "...", "L1", "L0", "R15", "R14", "...", "R1", "R0", "L15"] }, + ]} + + - Data is updated on the falling edge + - Data is sampled on the rising edge + - Words are MSB-to-LSB, left-justified (**NOTE: this is a deviation from strict I2S, which offsets by 1 from the left**) + - Sync is an input (FPGA is slave, codec is master): low => left channel, high => right channel + - Sync can be longer than the wordlen, extra bits are just ignored + - Tx is data to the codec (SDI pin on LM49352) + - Rx is data from the codec (SDO pin on LM49352) + + """) + + # one cache line is 8 32-bit words, need to always have enough space for one line or else nothing works + if fifodepth > 504: + fifodepth = 504 + print("I2S warning: fifo depth greater than 504 selected; truncating to 504") + if fifodepth < 8: + fifodepth = 8 + print("I2S warning: fifo depth less than 8 selected; truncating to 8") + + # connect pins, synchronizers, and edge detectors + if hasattr(pads, 'tx'): + tx_pin = Signal() + self.comb += pads.tx.eq(tx_pin) + if hasattr(pads, 'rx'): + rx_pin = Signal() + self.specials += MultiReg(pads.rx, rx_pin) + sync_pin = Signal() + self.specials += MultiReg(pads.sync, sync_pin) + + clk_pin = Signal() + self.specials += MultiReg(pads.clk, 
clk_pin) + clk_d = Signal() + self.sync += clk_d.eq(clk_pin) + rising_edge = Signal() + falling_edge = Signal() + self.comb += [rising_edge.eq(clk_pin & ~clk_d), falling_edge.eq(~clk_pin & clk_d)] + + # wishbone bus + self.bus = wishbone.Interface() + rd_ack = Signal() + wr_ack = Signal() + self.comb +=[ + If(self.bus.we, + self.bus.ack.eq(wr_ack), + ).Else( + self.bus.ack.eq(rd_ack), + ) + ] + + # interrupts + self.submodules.ev = EventManager() + if hasattr(pads, 'rx'): + self.ev.rx_ready = EventSourcePulse() # rising edge triggered, indicates FIFO is ready to read + self.ev.rx_error = EventSourcePulse() # indicates a rx error has happened + if hasattr(pads, 'tx'): + self.ev.tx_ready = EventSourcePulse() # indicates space available in Tx buffer for next quantum + self.ev.tx_error = EventSourcePulse() # indicates a tx error has happened + self.ev.finalize() + + + # build the RX subsystem + if hasattr(pads, 'rx'): + rx_rd_d = Signal(32) + rx_almostfull = Signal() + rx_almostempty = Signal() + rx_full = Signal() + rx_empty = Signal() + rx_rdcount = Signal(9) + rx_rderr = Signal() + rx_wrerr = Signal() + rx_wrcount = Signal(9) + rx_rden = Signal() + rx_wr_d = Signal(32) + rx_wren = Signal() + + self.rx_ctl = CSRStorage(description="Rx data path control", + fields=[ + CSRField("enable", size=1, description="Enable the receiving data"), + CSRField("reset", size=1, description="Writing `1` resets the FIFO. 
Reset happens regardless of enable state.", pulse=1) + ]) + self.rx_stat = CSRStatus(description="Rx data path status", + fields=[ + CSRField("overflow", size=1, description="Rx overflow"), + CSRField("underflow", size=1, description="Rx underflow"), + CSRField("dataready", size=1, description="{} words of data loaded and ready to read".format(fifodepth)), + CSRField("empty", size=1, description="No data available in FIFO to read"), # next flags probably never used + CSRField("wrcount", size=9, description="Write count"), + CSRField("rdcount", size=9, description="Read count"), + CSRField("fifodepth", size=9, description="FIFO depth as synthesized") + ]) + self.comb += self.rx_stat.fields.fifodepth.eq(fifodepth) + + rx_rst_cnt = Signal(3) + rx_reset = Signal() + self.sync += [ + If(self.rx_ctl.fields.reset, + rx_rst_cnt.eq(5), # 5 cycles reset required by design + rx_reset.eq(1) + ).Else( + If(rx_rst_cnt == 0, + rx_reset.eq(0) + ).Else( + rx_rst_cnt.eq(rx_rst_cnt - 1), + rx_reset.eq(1) + ) + ) + ] + # at a width of 32 bits, an 18kiB fifo is 512 entries deep + self.specials += Instance("FIFO_SYNC_MACRO", + p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32, + p_ALMOST_EMPTY_OFFSET=8, p_ALMOST_FULL_OFFSET=(512 - fifodepth), + p_DO_REG=0, + + o_ALMOSTFULL=rx_almostfull, o_ALMOSTEMPTY=rx_almostempty, + o_DO=rx_rd_d, o_EMPTY=rx_empty, o_FULL=rx_full, + o_RDCOUNT=rx_rdcount, o_RDERR=rx_rderr, o_WRCOUNT=rx_wrcount, o_WRERR=rx_wrerr, + i_DI=rx_wr_d, i_CLK=ClockSignal(), i_RDEN=rx_rden & ~rx_reset, + i_WREN=rx_wren & ~rx_reset, i_RST=rx_reset, + ) + self.comb += [ # wire up the status signals and interrupts + self.rx_stat.fields.overflow.eq(rx_wrerr), + self.rx_stat.fields.underflow.eq(rx_rderr), + self.rx_stat.fields.dataready.eq(rx_almostfull), + self.rx_stat.fields.wrcount.eq(rx_wrcount), + self.rx_stat.fields.rdcount.eq(rx_rdcount), + self.ev.rx_ready.trigger.eq(rx_almostfull), + self.ev.rx_error.trigger.eq(rx_wrerr | rx_rderr), + ] + bus_read = Signal() + 
bus_read_d = Signal() + rd_ack_pipe = Signal() + self.comb += bus_read.eq(self.bus.cyc & self.bus.stb & ~self.bus.we & (self.bus.cti == 0)) + self.sync += [ # this is the bus responder -- only works for uncached memory regions + bus_read_d.eq(bus_read), + If(bus_read & ~bus_read_d, # one response, one cycle + rd_ack_pipe.eq(1), + If(~rx_empty, + self.bus.dat_r.eq(rx_rd_d), + rx_rden.eq(1), + ).Else( + self.bus.dat_r.eq(0xDEADBEEF), # don't stall the bus indefinitely if we try to read from an empty fifo...just return garbage + rx_rden.eq(0), + ) + ).Else( + rx_rden.eq(0), + rd_ack_pipe.eq(0), + ), + rd_ack.eq(rd_ack_pipe), + ] + + rx_cnt = Signal(5) + self.submodules.rxi2s = rxi2s = FSM(reset_state="IDLE") + rxi2s.act("IDLE", + NextValue(rx_wr_d, 0), + If(self.rx_ctl.fields.enable, + If(rising_edge & sync_pin, # wait_sync guarantees we start at the beginning of a left frame, and not in the middle + NextState("WAIT_SYNC"), + ) + ) + ), + rxi2s.act("WAIT_SYNC", + If(rising_edge & ~sync_pin, + NextState("LEFT"), + NextValue(rx_cnt, 16), + ), + ) + rxi2s.act("LEFT", + If(~self.rx_ctl.fields.enable, NextState("IDLE")).Else( + NextValue(rx_wr_d, Cat(rx_pin, rx_wr_d[:-1])), + NextValue(rx_cnt, rx_cnt - 1), + NextState("LEFT_WAIT"), + ) + ) + rxi2s.act("LEFT_WAIT", + If(~self.rx_ctl.fields.enable, NextState("IDLE")).Else( + If(rising_edge, + If((rx_cnt == 0) & sync_pin, + NextValue(rx_cnt, 16), + NextState("RIGHT") + ).Elif(rx_cnt > 0, + NextState("LEFT"), + ) + ) + ) + ) + rxi2s.act("RIGHT", + If(~self.rx_ctl.fields.enable, NextState("IDLE")).Else( + NextValue(rx_wr_d, Cat(rx_pin, rx_wr_d[:-1])), + NextValue(rx_cnt, rx_cnt - 1), + NextState("RIGHT_WAIT"), + ) + ) + rxi2s.act("RIGHT_WAIT", + If(~self.rx_ctl.fields.enable, NextState("IDLE")).Else( + If(rising_edge, + If((rx_cnt == 0) & ~sync_pin, + NextValue(rx_cnt, 16), + NextState("LEFT"), + rx_wren.eq(1), # pulse rx_wren to write the current data word + ).Elif(rx_cnt > 0, + NextState("RIGHT"), + ) + ) + ) + ) + + + # 
build the TX subsystem + if hasattr(pads, 'tx'): + tx_rd_d = Signal(32) + tx_almostfull = Signal() + tx_almostempty = Signal() + tx_full = Signal() + tx_empty = Signal() + tx_rdcount = Signal(9) + tx_rderr = Signal() + tx_wrerr = Signal() + tx_wrcount = Signal(9) + tx_rden = Signal() + tx_wr_d = Signal(32) + tx_wren = Signal() + + self.tx_ctl = CSRStorage(description="Tx data path control", + fields=[ + CSRField("enable", size=1, description="Enable the transmission data"), + CSRField("reset", size=1, description="Writing `1` resets the FIFO. Reset happens regardless of enable state.", pulse=1) + ]) + self.tx_stat = CSRStatus(description="Tx data path status", + fields=[ + CSRField("overflow", size=1, description="Tx overflow"), + CSRField("underflow", size=1, description="Tx underflow"), + CSRField("free", size=1, description="At least {} words of space free".format(fifodepth)), + CSRField("almostfull", size=1, description="Less than 8 words space available"), # the next few flags should be rarely used + CSRField("full", size=1, description="FIFO is full or overfull"), + CSRField("empty", size=1, description="FIFO is empty"), + CSRField("wrcount", size=9, description="Tx write count"), + CSRField("rdcount", size=9, description="Tx read count"), + ]) + + tx_rst_cnt = Signal(3) + tx_reset = Signal() + self.sync += [ + If(self.tx_ctl.fields.reset, + tx_rst_cnt.eq(5), # 5 cycles reset required by design + tx_reset.eq(1) + ).Else( + If(tx_rst_cnt == 0, + tx_reset.eq(0) + ).Else( + tx_rst_cnt.eq(tx_rst_cnt - 1), + tx_reset.eq(1) + ) + ) + ] + # at a width of 32 bits, an 18kiB fifo is 512 entries deep + self.specials += Instance("FIFO_SYNC_MACRO", + p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32, + p_ALMOST_EMPTY_OFFSET=fifodepth, p_ALMOST_FULL_OFFSET=8, + p_DO_REG=0, + + o_ALMOSTFULL=tx_almostfull, o_ALMOSTEMPTY=tx_almostempty, + o_DO=tx_rd_d, o_EMPTY=tx_empty, o_FULL=tx_full, + o_RDCOUNT=tx_rdcount, o_RDERR=tx_rderr, o_WRCOUNT=tx_wrcount, o_WRERR=tx_wrerr, + 
i_DI=tx_wr_d, i_CLK=ClockSignal(), i_RDEN=tx_rden & ~tx_reset, + i_WREN=tx_wren & ~tx_reset, i_RST=tx_reset, + ) + self.comb += [ # wire up the status signals and interrupts + self.tx_stat.fields.overflow.eq(tx_wrerr), + self.tx_stat.fields.underflow.eq(tx_rderr), + self.tx_stat.fields.free.eq(tx_almostempty), + self.tx_stat.fields.almostfull.eq(tx_almostfull), + self.tx_stat.fields.full.eq(tx_full), + self.tx_stat.fields.empty.eq(tx_empty), + self.tx_stat.fields.rdcount.eq(tx_rdcount), + self.tx_stat.fields.wrcount.eq(tx_wrcount), + self.ev.tx_ready.trigger.eq(tx_almostempty), + self.ev.tx_error.trigger.eq(tx_wrerr | tx_rderr), + ] + self.sync += [ # this is the bus responder -- need to check how this interacts with uncached memory region + If(self.bus.cyc & self.bus.stb & self.bus.we & ~self.bus.ack, + If(~tx_full, + tx_wr_d.eq(self.bus.dat_w), + tx_wren.eq(1), + wr_ack.eq(1), + ).Else( + tx_wren.eq(0), + wr_ack.eq(0), + ) + ).Else( + tx_wren.eq(0), + wr_ack.eq(0), + ) + ] + + tx_cnt = Signal(5) + tx_buf = Signal(32) + self.submodules.txi2s = txi2s = FSM(reset_state="IDLE") + txi2s.act("IDLE", + If(self.tx_ctl.fields.enable, + If(falling_edge & sync_pin, + NextState("WAIT_SYNC"), + ) + ) + ), + txi2s.act("WAIT_SYNC", + If(falling_edge & ~sync_pin, + NextState("LEFT"), + NextValue(tx_cnt, 16), + NextValue(tx_buf, tx_rd_d), + tx_rden.eq(1), + ), + ) + txi2s.act("LEFT", + If(~self.tx_ctl.fields.enable, NextState("IDLE")).Else( + NextValue(tx_pin, tx_buf[31]), + NextValue(tx_buf, Cat(0, tx_buf[:-1])), + NextValue(tx_cnt, tx_cnt - 1), + NextState("LEFT_WAIT"), + ) + ) + txi2s.act("LEFT_WAIT", + If(~self.tx_ctl.fields.enable, NextState("IDLE")).Else( + If(falling_edge, + If((tx_cnt == 0) & sync_pin, + NextValue(tx_cnt, 16), + NextState("RIGHT") + ).Elif(tx_cnt > 0, + NextState("LEFT"), + ) + ) + ) + ) + txi2s.act("RIGHT", + If(~self.tx_ctl.fields.enable, NextState("IDLE")).Else( + NextValue(tx_pin, tx_buf[31]), + NextValue(tx_buf, Cat(0, tx_buf[:-1])), + 
NextValue(tx_cnt, tx_cnt - 1), + NextState("RIGHT_WAIT"), + ) + ) + txi2s.act("RIGHT_WAIT", + If(~self.tx_ctl.fields.enable, NextState("IDLE")).Else( + If(falling_edge, + If((tx_cnt == 0) & ~sync_pin, + NextValue(tx_cnt, 16), + NextState("LEFT"), + NextValue(tx_buf, tx_rd_d), + tx_rden.eq(1), + ).Elif(tx_cnt > 0, + NextState("RIGHT"), + ) + ) + ) + ) \ No newline at end of file diff --git a/litex/soc/cores/spiopi.py b/litex/soc/cores/spiopi.py new file mode 100644 index 000000000..941d65c62 --- /dev/null +++ b/litex/soc/cores/spiopi.py @@ -0,0 +1,1080 @@ +# This file is Copyright (c) 2020 bunnie +# License: BSD + +from litex.soc.interconnect.csr_eventmanager import * +from litex.soc.interconnect import wishbone +from litex.soc.integration.doc import AutoDoc, ModuleDoc +from migen.genlib.cdc import MultiReg + +class SpiOpi(Module, AutoCSR, AutoDoc): + def __init__(self, pads, dq_delay_taps=31, sclk_name="SCLK_ODDR", + iddr_name="SPI_IDDR", miso_name="MISO_FDRE", sim=False, spiread=False, prefetch_lines=1): + self.intro = ModuleDoc(""" + Intro + ******** + + SpiOpi implements a dual-mode SPI or OPI interface. OPI is an octal (8-bit) wide + variant of SPI, which is unique to Macronix parts. It is concurrently interoperable + with SPI. The chip supports "DTR mode" (double transfer rate, e.g. DDR) where data + is transferred on each edge of the clock, and there is a source-synchronous DQS + associated with the input data. + + The chip by default boots into SPI-only mode (unless NV bits are burned otherwise) + so to enable OPI, a config register needs to be written with SPI mode. Note that once + the config register is written, the only way to return to SPI mode is to change + it with OPI writes, or to issue a hardware reset. This has major implications for + reconfiguring the FPGA: a simple JTAG command to reload from SPI will not yank PROG_B low, + and so the SPI ROM will be in DOPI, and SPI loading will fail. 
Thus, system architects + must take into consideration a hard reset for the ROM whenever a bitstream reload + is demanded of the FPGA. + + The SpiOpi architecture is split into two levels: a command manager, and a + cycle manager. The command manager is responsible for taking the current wishbone + request and CSR state and unpacking these into cycle-by-cycle requests. The cycle + manager is responsible for coordinating the cycle-by-cycle requests. + + In SPI mode, this means marshalling byte-wide requests into a series of 8 serial cycles. + + In OPI [DOPI] mode, this means marshalling 16-bit wide requests into a pair of back-to-back DDR + cycles. Note that because the cycles are DDR, this means one 16-bit wide request must be + issued every cycle to keep up with the interface. + + For the output of data to ROM, expects a clock called "spinor_delayed" which is a delayed + version of "sys". The delay is necessary to get the correct phase relationship between + the SIO and SCLK in DTR/DDR mode, and it also has to compensate for the special-case + difference in the CCLK pad vs other I/O. + + For the input, DQS signal is independently delayed relative to the DQ signals using + an IDELAYE2 block. At a REFCLK frequency of 200 MHz, each delay tap adds 78ps, so up + to a 2.418ns delay is possible between DQS and DQ. The goal is to delay DQS relative + to DQ, because the SPI chip launches both with concurrent rising edges (to within 0.6ns), + but the IDDR register needs the rising edge of DQS to be centered inside the DQ eye. + + In DOPI mode, there is a prefetch buffer. It will read `prefetch_lines` cache lines of + data into the prefetch buffer. A cache line is 256 bits (or 8x32-bit words). The maximum + value is 63 lines (one line is necessary for synchronization margin). The downside of + setting prefetch_lines high is that the prefetcher is running constantly and burning + power, while throwing away most data. 
In practice, the CPU will typically consume data + at only slightly faster than the rate of read-out from DOPI-mode ROM, and once data + is consumed the prefetch resumes. Thus, prefetch_lines is probably optimally around + 1-3 lines read-ahead of the CPU. Any higher than 3 lines probably just wastes power. + In short simulations, 1 line of prefetch seems to be enough to keep the prefetcher + ahead of the CPU even when it's simply running straight-line code. + + Note the "sim" parameter exists because there seems to be a bug in xvlog that doesn't + correctly simulate the IDELAY machines. Setting "sim" to True removes the IDELAY machines + and passes the data through directly, but in real hardware the IDELAY machines are + necessary to meet timing between DQS and DQ. + + dq_delay_taps probably doesn't need to be adjusted; it can be tweaked for timing + closure. The delays can also be adjusted at runtime. + """) + if prefetch_lines > 63: + prefetch_lines = 63 + + self.spi_mode = Signal(reset=1) # when reset is asserted, force into spi mode + cs_n = Signal(reset=1) # make sure CS is sane on reset, too + + self.config = CSRStorage(fields=[ + CSRField("dummy", size=5, description="Number of dummy cycles", reset=10), + ]) + + delay_type = "VAR_LOAD" + + # DQS input conditioning ----------------------------------------------------------------- + dqs_iobuf = Signal() + self.clock_domains.cd_dqs = ClockDomain(reset_less=True) + self.comb += self.cd_dqs.clk.eq(dqs_iobuf) + self.specials += [ + Instance("BUFR", i_I=pads.dqs, o_O=dqs_iobuf), + ] + + # DQ connections ------------------------------------------------------------------------- + # PHY API + self.do = Signal(16) # OPI data to SPI + self.di = Signal(16) # OPI data from SPI + self.tx = Signal() # when asserted OPI is transmitting data to SPI, otherwise, receiving + + self.mosi = Signal() # SPI data to SPI + self.miso = Signal() # SPI data from SPI + + # Delay programming API + self.delay_config = CSRStorage(fields=[ + 
CSRField("d", size=5, description="Delay amount; each increment is 78ps", reset=31), + CSRField("load", size=1, description="Force delay taps to delay_d"), + ]) + self.delay_status = CSRStatus(fields=[ + CSRField("q", size=5, description="Readback of current delay amount, useful if inc/ce is used to set"), + ]) + self.delay_update = Signal() + self.hw_delay_load = Signal() + self.sync += self.delay_update.eq(self.hw_delay_load | self.delay_config.fields.load) + + # Break system API into rising/falling edge samples + do_rise = Signal(8) # data output presented on the rising edge + do_fall = Signal(8) # data output presented on the falling edge + self.comb += [do_rise.eq(self.do[8:]), do_fall.eq(self.do[:8])] + + di_rise = Signal(8) + di_fall = Signal(8) + self.comb += self.di.eq(Cat(di_fall, di_rise)) + + # OPI DDR registers + dq = TSTriple(7) # dq[0] is special because it is also MOSI + dq_delayed = Signal(8) + self.specials += dq.get_tristate(pads.dq[1:]) + for i in range(1, 8): + self.specials += Instance("ODDR", + p_DDR_CLK_EDGE="SAME_EDGE", + i_C=ClockSignal(), i_R=ResetSignal(), i_S=0, i_CE=1, + i_D1=do_rise[i], i_D2=do_fall[i], o_Q=dq.o[i - 1], + ) + if sim == False: + if i == 1: # only wire up o_CNTVALUEOUT for one instance + self.specials += Instance("IDELAYE2", + p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA", + p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", + p_REFCLK_FREQUENCY=200.0, + p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, + p_IDELAY_TYPE=delay_type, + + i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, + i_CE=0, + i_LD=self.delay_update, + i_CNTVALUEIN=self.delay_config.fields.d, + o_CNTVALUEOUT=self.delay_status.fields.q, + i_IDATAIN=dq.i[i - 1], o_DATAOUT=dq_delayed[i], + ), + else: # don't wire up o_CNTVALUEOUT for others + self.specials += Instance("IDELAYE2", + p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA", + p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", + p_REFCLK_FREQUENCY=200.0, + 
p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, + p_IDELAY_TYPE=delay_type, + + i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, + i_CE=0, + i_LD=self.delay_update, + i_CNTVALUEIN=self.delay_config.fields.d, + i_IDATAIN=dq.i[i - 1], o_DATAOUT=dq_delayed[i], + ), + else: + self.comb += dq_delayed[i].eq(dq.i[i - 1]) + self.specials += Instance("IDDR", name="{}{}".format(iddr_name, str(i)), + p_DDR_CLK_EDGE="SAME_EDGE_PIPELINED", + i_C=dqs_iobuf, i_R=ResetSignal(), i_S=0, i_CE=1, + i_D=dq_delayed[i], o_Q1=di_rise[i], o_Q2=di_fall[i], + ) + # SPI SDR register + self.specials += [ + Instance("FDRE", name="{}".format(miso_name), i_C=~ClockSignal("spinor"), i_CE=1, i_R=0, o_Q=self.miso, + i_D=dq_delayed[1], + ) + ] + + # bit 0 (MOSI) is special-cased to handle SPI mode + dq_mosi = TSTriple(1) # this has similar structure but an independent "oe" signal + self.specials += dq_mosi.get_tristate(pads.dq[0]) + do_mux_rise = Signal() # mux signal for mosi/dq select of bit 0 + do_mux_fall = Signal() + self.specials += [ + Instance("ODDR", + p_DDR_CLK_EDGE="SAME_EDGE", + i_C=ClockSignal(), i_R=ResetSignal(), i_S=0, i_CE=1, + i_D1=do_mux_rise, i_D2=do_mux_fall, o_Q=dq_mosi.o, + ), + Instance("IDDR", + p_DDR_CLK_EDGE="SAME_EDGE_PIPELINED", + i_C=dqs_iobuf, i_R=ResetSignal(), i_S=0, i_CE=1, + o_Q1=di_rise[0], o_Q2=di_fall[0], i_D=dq_delayed[0], + ), + ] + if sim == False: + self.specials += [ + Instance("IDELAYE2", + p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA", + p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0, + p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type, + + i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0, + i_LD=self.delay_update, + i_CNTVALUEIN=self.delay_config.fields.d, + i_IDATAIN=dq_mosi.i, o_DATAOUT=dq_delayed[0], + ), + ] + else: + self.comb += dq_delayed[0].eq(dq_mosi.i) + + # wire up SCLK interface + clk_en = Signal() + self.specials += [ + # de-activate 
the CCLK interface, parallel it with a GPIO + Instance("STARTUPE2", + i_CLK=0, i_GSR=0, i_GTS=0, i_KEYCLEARB=0, i_PACK=0, i_USRDONEO=1, i_USRDONETS=1, + i_USRCCLKO=0, i_USRCCLKTS=1, # force to tristate + ), + Instance("ODDR", name=sclk_name, # need to name this so we can constrain it properly + p_DDR_CLK_EDGE="SAME_EDGE", + i_C=ClockSignal("spinor"), i_R=ResetSignal("spinor"), i_S=0, i_CE=1, + i_D1=clk_en, i_D2=0, o_Q=pads.sclk, + ) + ] + + # wire up CS_N + spi_cs_n = Signal() + opi_cs_n = Signal() + self.comb += cs_n.eq((self.spi_mode & spi_cs_n) | (~self.spi_mode & opi_cs_n)) + self.specials += [ + Instance("ODDR", + p_DDR_CLK_EDGE="SAME_EDGE", + i_C=ClockSignal(), i_R=0, i_S=ResetSignal(), i_CE=1, + i_D1=cs_n, i_D2=cs_n, o_Q=pads.cs_n, + ), + ] + + self.architecture = ModuleDoc(""" + Architecture + ************** + + The machine is split into two separate pieces, one to handle SPI, and one to handle OPI. + + SPI + ===== + The SPI machine architecture is split into two levels: MAC and PHY. + + The MAC layer is responsible for: + - receiving requests via CSR register to perform config/status/special command sequences, + and dispatching these to the SPI PHY + - translating wishbone bus requests into command sequences, and routing them to either OPI + or SPI PHY. + - managing the chip select to the chip, and ensuring that one dummy cycle is inserted after + chip select is asserted, or before it is de-asserted; and that the chip select "high" times + are adequate (1 cycle between reads, 4 cycles for all other operations) + + On boot, the interface runs in SPI; once the wakeup sequence is executed, the chip permanently + switches to OPI mode unless the CR2 registers are written to fall back, or the + reset to the chip is asserted. 
+ + The PHY layers are responsible for the following tasks: + - Serializing and deserializing data, standardized on 8 bits for SPI and 16 bits for OPI + - counting dummy cycles + - managing the clock enable + + PHY cycles are initiated with a "req" signal, which is only sampled for + one cycle and then ignored until the PHY issues an "ack" that the current cycle is complete. + Thus holding "req" high can allow the PHY to back-to-back issue cycles without pause. + + OPI + ===== + The OPI machine is split into three parts: a command controller, a Tx PHY, and an Rx PHY. + + The Tx PHY is configured with a "dummy cycle" count register, as there is a variable length + delay for dummy cycles in OPI. + + In OPI mode, read data is `mesochronous`, that is, they return at precisely the same frequency + as SCLK, but with an unknown phase relationship. The DQS strobe is provided as a "hint" to + the receiving side to help retime the data. The mesochronous nature of the read data is + why the Tx and Rx PHY must be split into two separate machines, as they are operating in + different clock domains. + + DQS is implemented on the ROM as an extra data output that is guaranteed to change polarity with + each data byte; the skew mismatch of DQS to data is within +/-0.6ns or so. It turns out the mere + act of routing the DQS into a BUFR buffer before clocking the data into an IDDR primitive + is sufficient to delay the DQS signal and meet setup and hold time on the IDDR. + + Once captured by the IDDR, the data is fed into a dual-clock FIFO to make the transition + from the DQS to sysclk domains cleanly. + + Because of the latency involved in going from pin->IDDR->FIFO, excess read cycles are + required beyond the end of the requested cache line. However, there is virtually no + penalty in pre-filling the FIFO with data; if a new cache line has to be fetched, + the FIFO can simply be reset and all pointers zeroed. 
In fact, pre-filling the FIFO + can lead to great performance benefits if sequential cache lines are requested. In + simulation, a cache line can be filled in 10 bus cycles if it happens to be prefetched + (as opposed to 49 bus cycles for random reads). Either way, this compares favorably to + 288 cycles for random reads in 100MHz SPI mode (or 576 for the spimemio.v, which runs at + 50MHz). + + The command controller is responsible for sequencing all commands other than fast reads. Most + commands have some special-case structure to them, and as more commands are implemented, the + state machine is expected to grow fairly large. Fast reads are directly handled in "tx_run" + mode, where the TxPhy and RxPhy run a tight loop to watch incoming read bus cycles, check + the current address, fill the prefetch fifo, and respond to bus cycles. + + Writes to ROM might lock up the machine; a TODO is to test this and do something more sane, + like ignore writes by sending an ACK immediately while discarding the data. + + Thus, an OPI read proceeds as follows: + + - When BUS/STB are asserted: + TxPhy: + + - capture bus_adr, and compare against the *next read* address pointer + - if they match, allow the PHYs to do the work + + - if bus_adr and next read address don't match, save to next read address pointer, and + cycle wr/rd clk for 5 cycles while asserting reset to reset the FIFO + - initiate an 8DTRD with the read address pointer + - wait the specified dummy cycles + + - greedily pre-fill the FIFO by continuing to clock DQS until either: + - the FIFO is full + - pre-fetch is aborted because bus_adr and next read address don't match and FIFO is reset + + RxPHY: + - while CTI==2, assemble data into 32-bit words as soon as EMPTY is deasserted, + present a bus_ack, and increment the next read address pointer + - when CTI==7, ack the data, and wait until the next bus cycle with CTI==2 to resume + reading + + - A FIFO_SYNC_MACRO is used to instantiate the FIFO. 
This is chosen because: + - we can specify RAMB18's, which seem to be under-utilized by the auto-inferred memories by migen + - the XPM_FIFO_ASYNC macro claims no instantiation support, and also looks like it has weird + requirements for resetting the pointers: you must check the reset outputs, and the time to + reset is reported to be as high as around 200ns (anecdotally -- could be just that the sim I + read on the web is using a really slow clock, but I'm guessing it's around 10 cycles). + - the FIFO_SYNC_MACRO has a well-specified fixed reset latency of 5 cycles. + - The main downside of FIFO_SYNC_MACRO over XPM_FIFO_ASYNC is that XPM_FIFO_ASYNC can automatically + allow for output data to be read at 32-bit widths, with writes at 16-bit widths. However, with a + bit of additional logic and pipelining, we can aggregate data into 32-bit words going into a + 32-bit FIFO_SYNC_MACRO, which is what we do in this implementation. + """) + self.bus = wishbone.Interface() + + self.command = CSRStorage( + description="Write individual bits to issue special commands to SPI; setting multiple bits at once leads to undefined behavior.", + fields=[ + CSRField("wakeup", size=1, description="Sequence through init & wakeup routine"), + CSRField("sector_erase", size=1, description="Erase a sector"), + ]) + self.sector = CSRStorage(description="Sector to erase", + fields=[ + CSRField("sector", size=32, description="Sector to erase") + ]) + self.status = CSRStatus(description="Interface status", + fields=[ + CSRField("wip", size=1, description="Operation in progress (write or erease)") + ]) + # TODO: implement ECC detailed register readback, CRC checking + + # PHY machine mux -------------------------------------------------------------------------- + # clk_en mux + spi_clk_en = Signal() + opi_clk_en = Signal() + self.sync += clk_en.eq(~self.spi_mode & opi_clk_en | self.spi_mode & spi_clk_en) + # tristate mux + self.sync += [ + dq.oe.eq(~self.spi_mode & self.tx), + 
dq_mosi.oe.eq(self.spi_mode | self.tx), + ] + # data out mux (no data in mux, as we can just sample data in all the time without harm) + self.comb += do_mux_rise.eq(~self.spi_mode & do_rise[0] | self.spi_mode & self.mosi) + self.comb += do_mux_fall.eq(~self.spi_mode & do_fall[0] | self.spi_mode & self.mosi) + + has_dummy = Signal() # indicates if the current "req" requires dummy cycles to be appended (used for both OPI/SPI) + rom_addr = Signal(32, + reset=0xFFFFFFFC) # location of the internal ROM address pointer; reset to invalid address to force an address request on first read + + # MAC/PHY abstraction for OPI + txphy_do = Signal(16) # two sources of data out for OPI, one from the PHY, one from MAC + txcmd_do = Signal(16) + opi_di = Signal(16) + + # internal machines + opi_addr = Signal(32) + opi_fifo_rd = Signal(32) + opi_fifo_wd = Signal(32) + opi_reset_rx_req = Signal() + opi_reset_rx_ack = Signal() + opi_rx_run = Signal() + + rx_almostempty = Signal() + rx_almostfull = Signal() + rx_empty = Signal() + rx_full = Signal() + rx_rdcount = Signal(9) + rx_rderr = Signal() + rx_wrcount = Signal(9) + rx_wrerr = Signal() + rx_rden = Signal() + rx_wren = Signal(reset=1) + rx_fifo_rst = Signal() + + wrendiv = Signal() + wrendiv2 = Signal() + self.specials += [ + # this next pair of async-clear flip flops creates a write-enable gate that (a) ignores the first + # two DQS strobes (as they are pipe-filling) and (b) alternates with the correct phase so we are + # sampling 32-bit data into the FIFO. + Instance("FDCE", name="FDCE_WREN", + i_C=dqs_iobuf, i_D=~wrendiv, o_Q=wrendiv, i_CE=1, i_CLR=~rx_wren, + ), + Instance("FDCE", name="FDCE_WREN", + i_C=dqs_iobuf, i_D=~wrendiv2, o_Q=wrendiv2, i_CE=wrendiv & ~wrendiv2, i_CLR=~rx_wren, + ), + # Direct FIFO primitive is more resource-efficient and faster than migen primitive. 
+ Instance("FIFO_DUALCLOCK_MACRO", + p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32, p_FIRST_WORD_FALL_THROUGH="TRUE", + p_ALMOST_EMPTY_OFFSET=6, p_ALMOST_FULL_OFFSET=(512 - (8 * prefetch_lines)), + + o_ALMOSTEMPTY=rx_almostempty, o_ALMOSTFULL=rx_almostfull, + o_DO=opi_fifo_rd, o_EMPTY=rx_empty, o_FULL=rx_full, + o_RDCOUNT=rx_rdcount, o_RDERR=rx_rderr, o_WRCOUNT=rx_wrcount, o_WRERR=rx_wrerr, + i_DI=opi_fifo_wd, i_RDCLK=ClockSignal(), i_RDEN=rx_rden, + i_WRCLK=dqs_iobuf, i_WREN=wrendiv & wrendiv2, i_RST=rx_fifo_rst, + ) + ] + self.sync.dqs += opi_di.eq(self.di) + self.comb += opi_fifo_wd.eq(Cat(opi_di, self.di)) + + # --------- OPI Rx Phy machine ------------------------------ + self.submodules.rxphy = rxphy = FSM(reset_state="IDLE") + cti_pipe = Signal(3) + rxphy_cnt = Signal(3) + rxphy.act("IDLE", + If(self.spi_mode, + NextState("IDLE"), + ).Else( + NextValue(self.bus.ack, 0), + If(opi_reset_rx_req, + NextState("WAIT_RESET"), + NextValue(rxphy_cnt, 6), + NextValue(rx_wren, 0), + NextValue(rx_fifo_rst, 1), + ).Elif(opi_rx_run, + NextValue(rx_wren, 1), + If((self.bus.cyc & self.bus.stb & ~self.bus.we) & ((self.bus.cti == 2) | + (( + self.bus.cti == 7) & ~self.bus.ack)), + # handle case of non-pipelined read, ack is late + If(~rx_empty, + NextValue(self.bus.dat_r, opi_fifo_rd), + rx_rden.eq(1), + NextValue(opi_addr, opi_addr + 4), + NextValue(self.bus.ack, 1), + ) + ) + ) + ) + ) + rxphy.act("WAIT_RESET", + NextValue(opi_addr, Cat(Signal(2), self.bus.adr)), + NextValue(rxphy_cnt, rxphy_cnt - 1), + If(rxphy_cnt == 0, + NextValue(rx_fifo_rst, 0), + opi_reset_rx_ack.eq(1), + NextState("IDLE"), + ) + ) + + # TxPHY machine: OPI ------------------------------------------------------------------------- + txphy_cnt = Signal(4) + tx_run = Signal() + txphy_cs_n = Signal(reset=1) + txcmd_cs_n = Signal(reset=1) + txphy_clken = Signal() + txcmd_clken = Signal() + txphy_oe = Signal() + txcmd_oe = Signal() + self.sync += opi_cs_n.eq((tx_run & txphy_cs_n) | (~tx_run & 
txcmd_cs_n)) + self.comb += If(tx_run, self.do.eq(txphy_do)).Else(self.do.eq(txcmd_do)) + self.comb += opi_clk_en.eq((tx_run & txphy_clken) | (~tx_run & txcmd_clken)) + self.comb += self.tx.eq((tx_run & txphy_oe) | (~tx_run & txcmd_oe)) + tx_almostfull = Signal() + self.sync += tx_almostfull.eq(rx_almostfull) # sync the rx_almostfull signal into the local clock domain + txphy_bus = Signal() + self.sync += txphy_bus.eq(self.bus.cyc & self.bus.stb & ~self.bus.we & (self.bus.cti == 2)) + tx_resetcycle = Signal() + + self.submodules.txphy = txphy = FSM(reset_state="RESET") + txphy.act("RESET", + NextValue(opi_rx_run, 0), + NextValue(txphy_oe, 0), + NextValue(txphy_cs_n, 1), + NextValue(txphy_clken, 0), + # guarantee that the first state we go to out of reset is a four-cycle burst + NextValue(txphy_cnt, 4), + If(tx_run & ~self.spi_mode, NextState("TX_SETUP")) + ) + txphy.act("TX_SETUP", + NextValue(opi_rx_run, 0), + NextValue(txphy_cnt, txphy_cnt - 1), + If(txphy_cnt > 0, + NextValue(txphy_cs_n, 1) + ).Else( + NextValue(txphy_cs_n, 0), + NextValue(txphy_oe, 1), + NextState("TX_CMD_CS_DELAY"), + ) + ) + txphy.act("TX_CMD_CS_DELAY", # meet setup timing for CS-to-clock + NextState("TX_CMD"), + ) + txphy.act("TX_CMD", + NextValue(txphy_do, 0xEE11), + NextValue(txphy_clken, 1), + NextState("TX_ADRHI"), + ) + txphy.act("TX_ADRHI", + NextValue(txphy_do, opi_addr[16:] & 0x07FF), # mask off unused bits + NextState("TX_ADRLO"), + ) + txphy.act("TX_ADRLO", + NextValue(txphy_do, opi_addr[:16]), + NextValue(txphy_cnt, self.config.fields.dummy - 1), + NextState("TX_DUMMY"), + ) + txphy.act("TX_DUMMY", + NextValue(txphy_oe, 0), + NextValue(txphy_do, 0), + NextValue(txphy_cnt, txphy_cnt - 1), + If(txphy_cnt == 0, + NextValue(opi_rx_run, 1), + If(tx_resetcycle, + NextValue(txphy_clken, 1), + NextValue(opi_reset_rx_req, 1), + NextState("TX_RESET_RX"), + ).Else( + NextState("TX_FILL"), + ) + ) + ) + txphy.act("TX_FILL", + If(tx_run, + If(((~txphy_bus & (self.bus.cyc & self.bus.stb & 
~self.bus.we & (self.bus.cti == 2))) & + (opi_addr[2:] != self.bus.adr)) | tx_resetcycle, + # it's a new bus cycle, and the requested address is not equal to the current read buffer address + NextValue(txphy_clken, 1), + NextValue(opi_reset_rx_req, 1), + NextState("TX_RESET_RX"), + ).Else( + If(tx_almostfull & ~self.bus.ack, + NextValue(txphy_clken, 0) + ).Else( + NextValue(txphy_clken, 1) + ) + ), + If(~(self.bus.cyc & self.bus.stb), + NextValue(opi_rx_run, 0), + ).Else( + NextValue(opi_rx_run, 1), + ) + ).Else( + NextValue(txphy_clken, 0), + NextState("RESET") + ) + ) + txphy.act("TX_RESET_RX", # keep clocking the RX until it acknowledges a reset + NextValue(opi_rx_run, 0), + NextValue(opi_reset_rx_req, 0), + If(opi_reset_rx_ack, + NextValue(txphy_clken, 0), + NextValue(txphy_cnt, 0), # 1 cycle CS on back-to-back reads + NextValue(txphy_cs_n, 1), + NextState("TX_SETUP"), + ).Else( + NextValue(txphy_clken, 1), + ) + ) + + # --------- OPI CMD machine ------------------------------ + self.submodules.opicmd = opicmd = FSM(reset_state="RESET") + opicmd.act("RESET", + NextValue(txcmd_do, 0), + NextValue(txcmd_oe, 0), + NextValue(tx_run, 0), + NextValue(txcmd_cs_n, 1), + If(~self.spi_mode, + NextState("IDLE"), + ).Else(NextState("RESET_CYCLE")), + ) + opicmd.act("RESET_CYCLE", + NextValue(txcmd_cs_n, 0), + If(opi_reset_rx_ack, + NextValue(tx_run, 1), + NextState("IDLE"), + ).Else( + NextValue(tx_run, 1), + tx_resetcycle.eq(1), + ) + ) + opicmd.act("IDLE", + NextValue(txcmd_cs_n, 1), + If(~self.spi_mode, # this machine stays in idle once spi_mode is dropped + ## The full form of this machine is as follows: + # - First check if there is a CSR special command pending + # - if so, wait until the current bus cycle is done, then de-assert tx_run + # - then run the command + # - Else wait until a bus cycle, and once it happens, put the system into run mode + If(self.bus.cyc & self.bus.stb, + If(~self.bus.we & (self.bus.cti == 2), + NextState("TX_RUN") + ).Else( + # handle 
other cases here, e.g. what do we do if we get a write? probably + # should just ACK it without doing anything so the CPU doesn't freeze... + ) + ).Elif(self.command.re, + NextState("DISPATCH_CMD"), + ) + ) + ) + opicmd.act("TX_RUN", + NextValue(tx_run, 1), + If(self.command.re, # respond to commands + NextState("WAIT_DISPATCH") + ) + ) + opicmd.act("WAIT_DISPATCH", # wait until the current cycle is done, then stop TX and dispatch command + If(~(self.bus.cyc & self.bus.stb), + NextValue(tx_run, 0), + NextState("DISPATCH_CMD") + ) + ) + opicmd.act("DISPATCH_CMD", + If(self.command.fields.sector_erase, + NextState("DO_SECTOR_ERASE"), + ).Else( + NextState("IDLE"), + ) + ) + opicmd.act("DO_SECTOR_ERASE", + # placeholder + ) + + # MAC/PHY abstraction for the SPI machine + spi_req = Signal() + spi_ack = Signal() + spi_do = Signal(8) # this is the API to the machine + spi_di = Signal(8) + + # PHY machine: SPI ------------------------------------------------------------------------- + + # internal signals are: + # selection - self.spi_mode + # OPI - self.do(16), self.di(16), self.tx + # SPI - self.mosi, self.miso + # cs_n - both + # ecs_n - OPI + # clk_en - both + + spicount = Signal(5) + spi_so = Signal(8) # this internal to the machine + spi_si = Signal(8) + spi_dummy = Signal() + spi_di_load = Signal() # spi_do load is pipelined back one cycle using this mechanism + spi_di_load2 = Signal() + spi_ack_pipe = Signal() + self.sync += [ + # pipelining is required the MISO path is very slow (IOB->fabric FD), and a falling-edge retiming reg is used to meet timing + spi_di_load2.eq(spi_di_load), + If(spi_di_load2, spi_di.eq(Cat(self.miso, spi_si[:-1]))).Else(spi_di.eq(spi_di)), + spi_ack.eq(spi_ack_pipe), + ] + self.comb += self.mosi.eq(spi_so[7]) + self.sync += spi_si.eq(Cat(self.miso, spi_si[:-1])) + self.submodules.spiphy = spiphy = FSM(reset_state="RESET") + spiphy.act("RESET", + If(spi_req, + NextState("REQ"), + NextValue(spicount, 7), + NextValue(spi_clk_en, 1), + 
NextValue(spi_so, spi_do), + NextValue(spi_dummy, has_dummy), + ).Else( + NextValue(spi_clk_en, 0), + NextValue(spi_ack_pipe, 0), + NextValue(spicount, 0), + NextValue(spi_dummy, 0), + ) + ) + spiphy.act("REQ", + If(spicount > 0, + NextValue(spicount, spicount - 1), + NextValue(spi_clk_en, 1), + NextValue(spi_so, Cat(0, spi_so[:-1])), + NextValue(spi_ack_pipe, 0), + ).Elif((spicount == 0) & spi_req & ~spi_dummy, # back-to-back transaction + NextValue(spi_clk_en, 1), + NextValue(spicount, 7), + NextValue(spi_clk_en, 1), + NextValue(spi_so, spi_do), # reload the so register + spi_di_load.eq(1), # "naked" .eq() create single-cycle pulses that default back to 0 + NextValue(spi_ack_pipe, 1), + NextValue(spi_dummy, has_dummy), + ).Elif((spicount == 0) & ~spi_req & ~spi_dummy, # go back to idle + spi_di_load.eq(1), + NextValue(spi_ack_pipe, 1), + NextValue(spi_clk_en, 0), + NextState("RESET"), + ).Elif((spicount == 0) & spi_dummy, + spi_di_load.eq(1), + NextValue(spicount, self.config.fields.dummy), + NextValue(spi_clk_en, 1), + NextValue(spi_ack_pipe, 0), + NextValue(spi_so, 0), # do a dummy with '0' as the output + NextState("DUMMY"), + ) # this actually should be a fully defined situation, no "Else" applicable + ) + spiphy.act("DUMMY", + If(spicount > 1, # instead of doing dummy-1, we stop at count == 1 + NextValue(spicount, spicount - 1), + NextValue(spi_clk_en, 1), + ).Elif(spicount <= 1 & spi_req, + NextValue(spi_clk_en, 1), + NextValue(spicount, 7), + NextValue(spi_so, spi_do), # reload the so register + NextValue(spi_ack_pipe, 1), # finally ack the cycle + NextValue(spi_dummy, has_dummy), + ).Else( + NextValue(spi_clk_en, 0), + NextValue(spi_ack_pipe, 1), # finally ack the cycle + NextState("RESET") + ) + ) + + # SPI MAC machine ------------------------------------------------------------------------------- + # default active on boot + addr_updated = Signal() + d_to_wb = Signal(32) # data going back to wishbone + mac_count = Signal(5) + new_cycle = Signal(1) + 
self.submodules.mac = mac = FSM(reset_state="RESET") + mac.act("RESET", + NextValue(self.spi_mode, 1), + NextValue(addr_updated, 0), + NextValue(d_to_wb, 0), + NextValue(spi_cs_n, 1), + NextValue(has_dummy, 0), + NextValue(spi_do, 0), + NextValue(spi_req, 0), + NextValue(mac_count, 0), + NextState("WAKEUP_PRE"), + NextValue(new_cycle, 1), + If(self.spi_mode, NextValue(self.bus.ack, 0)), + ) + if spiread: + mac.act("IDLE", + If(self.spi_mode, # this machine stays in idle once spi_mode is dropped + NextValue(self.bus.ack, 0), + If((self.bus.cyc == 1) & (self.bus.stb == 1) & (self.bus.we == 0) & (self.bus.cti != 7), + # read cycle requested, not end-of-burst + If((rom_addr[2:] != self.bus.adr) & new_cycle, + NextValue(rom_addr, Cat(Signal(2, reset=0), self.bus.adr)), + NextValue(addr_updated, 1), + NextValue(spi_cs_n, 1), # raise CS in anticipation of a new address cycle + NextState("SPI_READ_32_CS"), + ).Elif((rom_addr[2:] == self.bus.adr) | (~new_cycle & self.bus.cti == 2), + NextValue(mac_count, 3), # get another beat of 4 bytes at the next address + NextState("SPI_READ_32") + ).Else( + NextValue(addr_updated, 0), + NextValue(spi_cs_n, 0), + NextState("SPI_READ_32"), + NextValue(mac_count, 3), # prep the MAC state counter to count out 4 bytes + ), + ).Elif(self.command.fields.wakeup, + NextValue(spi_cs_n, 1), + NextValue(self.command.storage, 0), # clear all pending commands + NextState("WAKEUP_PRE"), + ) + ) + ) + else: + mac.act("IDLE", + If(self.spi_mode, # this machine stays in idle once spi_mode is dropped + If(self.command.fields.wakeup, + NextValue(spi_cs_n, 1), + NextValue(self.command.storage, 0), # clear all pending commands + NextState("WAKEUP_PRE"), + ) + ) + ) + + # --------- wakup chip ------------------------------ + mac.act("WAKEUP_PRE", + NextValue(spi_cs_n, 1), # why isn't this sticking? 
i shouldn't have to put this here + NextValue(mac_count, 4), + NextState("WAKEUP_PRE_CS_WAIT") + ) + mac.act("WAKEUP_PRE_CS_WAIT", + NextValue(mac_count, mac_count - 1), + If(mac_count == 0, + NextState("WAKEUP_WUP"), + NextValue(spi_cs_n, 0), + ) + ) + mac.act("WAKEUP_WUP", + NextValue(mac_count, mac_count - 1), + If(mac_count == 0, + NextValue(spi_cs_n, 0), + NextValue(spi_do, 0xab), # wakeup from deep sleep + NextValue(spi_req, 1), + NextState("WAKEUP_WUP_WAIT"), + ) + ) + mac.act("WAKEUP_WUP_WAIT", + NextValue(spi_req, 0), + If(spi_ack, + NextValue(spi_cs_n, 1), # raise CS + NextValue(mac_count, 4), # for >4 cycles per specsheet + NextState("WAKEUP_CR2_WREN_1") + ) + ) + + # --------- WREN+CR2 - dummy cycles ------------------------------ + mac.act("WAKEUP_CR2_WREN_1", + NextValue(mac_count, mac_count - 1), + If(mac_count == 0, + NextValue(spi_cs_n, 0), + NextValue(spi_do, 0x06), # WREN to unlock CR2 writing + NextValue(spi_req, 1), + NextState("WAKEUP_CR2_WREN_1_WAIT"), + ) + ) + mac.act("WAKEUP_CR2_WREN_1_WAIT", + NextValue(spi_req, 0), + If(spi_ack, + NextValue(spi_cs_n, 1), + NextValue(mac_count, 4), + NextState("WAKEUP_CR2_DUMMY_CMD"), + ) + ) + mac.act("WAKEUP_CR2_DUMMY_CMD", + NextValue(mac_count, mac_count - 1), + If(mac_count == 0, + NextValue(spi_cs_n, 0), + NextValue(spi_do, 0x72), # CR2 command + NextValue(spi_req, 1), + NextValue(mac_count, 2), + NextState("WAKEUP_CR2_DUMMY_ADRHI"), + ) + ) + mac.act("WAKEUP_CR2_DUMMY_ADRHI", + NextValue(spi_do, 0x00), # we want to send 00_00_03_00 + If(spi_ack, + NextValue(mac_count, mac_count - 1), + ), + If(mac_count == 0, + NextState("WAKEUP_CR2_DUMMY_ADRMID") + ) + ) + mac.act("WAKEUP_CR2_DUMMY_ADRMID", + NextValue(spi_do, 0x03), + If(spi_ack, NextState("WAKEUP_CR2_DUMMY_ADRLO")), + ) + mac.act("WAKEUP_CR2_DUMMY_ADRLO", + NextValue(spi_do, 0x00), + If(spi_ack, NextState("WAKEUP_CR2_DUMMY_DATA")), + ) + mac.act("WAKEUP_CR2_DUMMY_DATA", + NextValue(spi_do, 0x05), # 10 dummy cycles as required for 84MHz-104MHz 
operation + If(spi_ack, NextState("WAKEUP_CR2_DUMMY_WAIT")), + ) + mac.act("WAKEUP_CR2_DUMMY_WAIT", + NextValue(spi_req, 0), + If(spi_ack, + NextValue(spi_cs_n, 1), + NextValue(mac_count, 4), + NextState("WAKEUP_CR2_WREN_2") + ) + ) + + # --------- WREN+CR2 to DOPI mode ------------------------------ + mac.act("WAKEUP_CR2_WREN_2", + NextValue(mac_count, mac_count - 1), + If(mac_count == 0, + NextValue(spi_cs_n, 0), + NextValue(spi_do, 0x06), # WREN to unlock CR2 writing + NextValue(spi_req, 1), + NextState("WAKEUP_CR2_WREN_2_WAIT"), + ) + ) + mac.act("WAKEUP_CR2_WREN_2_WAIT", + NextValue(spi_req, 0), + If(spi_ack, + NextValue(spi_cs_n, 1), + NextValue(mac_count, 4), + NextState("WAKEUP_CR2_DOPI_CMD"), + ) + ) + mac.act("WAKEUP_CR2_DOPI_CMD", + NextValue(mac_count, mac_count - 1), + If(mac_count == 0, + NextValue(spi_cs_n, 0), + NextValue(spi_do, 0x72), # CR2 command + NextValue(spi_req, 1), + NextValue(mac_count, 4), + NextState("WAKEUP_CR2_DOPI_ADR"), + ) + ) + mac.act("WAKEUP_CR2_DOPI_ADR", # send 0x00_00_00_00 as address + NextValue(spi_do, 0x00), # no need to raise CS or lower spi_req, this is back-to-back + If(spi_ack, + NextValue(mac_count, mac_count - 1), + ), + If(mac_count == 0, + NextState("WAKEUP_CR2_DOPI_DATA"), + ) + ), + mac.act("WAKEUP_CR2_DOPI_DATA", + NextValue(spi_do, 2), # enable DOPI mode + If(spi_ack, NextState("WAKEUP_CR2_DOPI_WAIT")), + ) + mac.act("WAKEUP_CR2_DOPI_WAIT", # trailing CS wait + NextValue(spi_req, 0), + If(spi_ack, + NextValue(spi_cs_n, 1), + NextValue(mac_count, 4), + NextState("WAKEUP_CS_EXIT") + ) + ) + mac.act("WAKEUP_CS_EXIT", + NextValue(self.spi_mode, 0), # now enter DOPI mode + NextValue(mac_count, mac_count - 1), + If(mac_count == 0, + NextState("IDLE"), + ) + ) + + if spiread: + # --------- SPI read machine ------------------------------ + mac.act("SPI_READ_32", + If(addr_updated, + NextState("SPI_READ_32_CS"), + NextValue(has_dummy, 0), + NextValue(mac_count, 3), + NextValue(spi_cs_n, 1), + NextValue(spi_req, 0), + 
).Else( + If(mac_count > 0, + NextValue(has_dummy, 0), + NextValue(spi_req, 1), + NextState("SPI_READ_32_D") + ).Else( + NextValue(spi_req, 0), + If(spi_ack, + If(self.spi_mode, + # protect these in a spi_mode mux to prevent excess inference of logic to handle otherwise implicit dual-master situation + NextValue(self.bus.dat_r, Cat(d_to_wb[8:], spi_di)), + NextValue(self.bus.ack, 1), + ), + NextValue(rom_addr, rom_addr + 1), + NextState("IDLE") + ) + ) + ) + ) + mac.act("SPI_READ_32_D", + If(spi_ack, + # shift in one byte at a time to d_to_wb(32) + NextValue(d_to_wb, Cat(d_to_wb[8:], spi_di, )), + NextValue(mac_count, mac_count - 1), + NextState("SPI_READ_32"), + NextValue(rom_addr, rom_addr + 1), + ) + ) + mac.act("SPI_READ_32_CS", + NextValue(mac_count, mac_count - 1), + If(mac_count == 0, + NextValue(spi_cs_n, 0), + NextState("SPI_READ_32_A0"), + ) + ) + mac.act("SPI_READ_32_A0", + NextValue(spi_do, 0x0c), # 32-bit address write for "fast read" command + NextValue(spi_req, 1), + NextState("SPI_READ_32_A1"), + ) + mac.act("SPI_READ_32_A1", + NextValue(spi_do, rom_addr[24:] & 0x7), + # queue up MSB to send, leave req high; mask off unused high bits + If(spi_ack, + NextState("SPI_READ_32_A2"), + ) + ) + mac.act("SPI_READ_32_A2", + NextValue(spi_do, rom_addr[16:24]), + If(spi_ack, + NextState("SPI_READ_32_A3"), + ) + ) + mac.act("SPI_READ_32_A3", + NextValue(spi_do, rom_addr[8:16]), + If(spi_ack, + NextState("SPI_READ_32_A4"), + ) + ) + mac.act("SPI_READ_32_A4", + NextValue(spi_do, rom_addr[:8]), + If(spi_ack, + NextState("SPI_READ_32_A5"), + ) + ) + mac.act("SPI_READ_32_A5", + NextValue(spi_do, 0), + If(spi_ack, + NextState("SPI_READ_32_DUMMY") + ) + ) + mac.act("SPI_READ_32_DUMMY", + NextValue(spi_req, 0), + NextValue(addr_updated, 0), + If(spi_ack, + NextState("SPI_READ_32"), + NextValue(mac_count, 3), # prep the MAC state counter to count out 4 bytes + ).Else( + NextState("SPI_READ_32_DUMMY") + ) + ) + + # Handle ECS_n 
----------------------------------------------------------------------------- + # treat ECS_N as an async signal -- just a "rough guide" of problems + ecs_n = Signal() + self.specials += MultiReg(pads.ecs_n, ecs_n) + + self.submodules.ev = EventManager() + self.ev.ecc_error = EventSourceProcess() # Falling edge triggered + self.ev.finalize() + self.comb += self.ev.ecc_error.trigger.eq(ecs_n) + ecc_reported = Signal() + ecs_n_delay = Signal() + ecs_pulse = Signal() + + self.ecc_address = CSRStatus(fields=[ + CSRField("ecc_address", size=32, description="Address of the most recent ECC event") + ]) + self.ecc_status = CSRStatus(fields=[ + CSRField("ecc_error", size=1, + description="Live status of the ECS_N bit (ECC error on current packet when low)"), + CSRField("ecc_overflow", size=1, + description="More than one ECS_N event has happened since th last time ecc_address was checked") + ]) + + self.comb += self.ecc_status.fields.ecc_error.eq(ecs_n) + self.comb += [ + ecs_pulse.eq(ecs_n_delay & ~ecs_n), # falling edge -> positive pulse + If(ecs_pulse, + self.ecc_address.fields.ecc_address.eq(rom_addr), + If(ecc_reported, + self.ecc_status.fields.ecc_overflow.eq(1) + ).Else( + self.ecc_status.fields.ecc_overflow.eq(self.ecc_status.fields.ecc_overflow), + ) + ).Else( + self.ecc_address.fields.ecc_address.eq(self.ecc_address.fields.ecc_address), + If(self.ecc_status.we, + self.ecc_status.fields.ecc_overflow.eq(0), + ).Else( + self.ecc_status.fields.ecc_overflow.eq(self.ecc_status.fields.ecc_overflow), + ) + ) + ] + self.sync += [ + ecs_n_delay.eq(ecs_n), + If(ecs_pulse, + ecc_reported.eq(1) + ).Elif(self.ecc_address.we, + ecc_reported.eq(0) + ) + ]