From cc6ed667dfc28d7e869f011f92ee286ee944a019 Mon Sep 17 00:00:00 2001
From: bunnie <bunnie@kosagi.com>
Date: Thu, 6 Feb 2020 17:25:00 +0800
Subject: [PATCH 1/5] Request to merge I2S and SPIOPI cores

---
 litex/soc/cores/i2s.py    |  435 +++++++++++++++
 litex/soc/cores/spiopi.py | 1080 +++++++++++++++++++++++++++++++++++++
 2 files changed, 1515 insertions(+)
 create mode 100644 litex/soc/cores/i2s.py
 create mode 100644 litex/soc/cores/spiopi.py

diff --git a/litex/soc/cores/i2s.py b/litex/soc/cores/i2s.py
new file mode 100644
index 000000000..1125f7920
--- /dev/null
+++ b/litex/soc/cores/i2s.py
@@ -0,0 +1,435 @@
+# This file is Copyright (c) 2020 bunnie <bunnie@kosagi.com>
+# License: BSD
+
+from litex.soc.interconnect.csr_eventmanager import *
+from litex.soc.interconnect import wishbone
+from litex.soc.integration.doc import AutoDoc, ModuleDoc
+from migen.genlib.cdc import MultiReg
+
+class i2s_slave(Module, AutoCSR, AutoDoc):
+    def __init__(self, pads, fifodepth=256):
+        self.intro = ModuleDoc("""
+        Intro
+        *******
+        
+        I2S slave creates a slave audio interface instance. Tx and Rx interfaces are inferred
+        based upon the presence or absence of the respective pins in the "pads" argument.
+        
+        The interface is I2S-like, but note the deviation that the bits are justified
+        left without a 1-bit pad after sync edges. This isn't a problem for talking to the LM49352
+        codec this was designed for, as the bit offset is programmable, but this will not work well if
+        are talking to a CODEC without a programmable bit offset!
+        
+        System Interface
+        =================
+        
+        Audio interchange is done with the system using 16-bit stereo samples, with the right channel
+        mapped to the least significant word of a 32-bit word. Thus each 32-bit word is a single
+        stereo sample. As this is a slave I2S interface, sampling rate and framing is set by the programming 
+        of the audio CODEC chip. A slave situation is preferred because this defers the generation of
+        audio clocks to the CODEC, which has PLLs specialized to generate the correct frequencies for
+        audio sampling rates.
+          
+        `fifodepth` is the depth at which either a read interrupt is fired (guaranteeing at least `fifodepth`
+        stereo samples in the receive FIFO) or a write interrupt is fired (guaranteeing at least `fifodepth`
+        free space in the transmit FIFO). The maximum depth is 512.
+        
+        To receive audio data:
+        
+        - reset the Rx FIFO, to guarantee all pointers at zero
+        - hook the Rx full interrupt with an interrupt handler (optional)
+        - if the CODEC is not yet transmitting data, initiate data transmission
+        - enable Rx FIFO to run
+        - poll or wait for interrupt; upon interrupt, read `fifodepth` words. Repeat.
+        - to close the stream, simply clear the Rx FIFO enable bit. The next initiation should call a
+          reset of the FIFO to ensure leftover previous data is cleared from the FIFO. 
+        
+        To transmit audio data:
+        
+        - reset the Tx FIFO, to guarantee all pointers at zero
+        - hook the Tx available interrupt with an interrupt handler (optional)
+        - write 512 words of data into the Tx FIFO, filling it to the max
+        - if the CODEC is not yet requesting data and unmuted, unmute and initiate reception 
+        - enable the Tx FIFO to run
+        - poll or wait for interrupt; upon interrupt, write `fifodepth` words. Repeat.
+        - to close stream, mute the DAC and stop the request clock. Ideally, this can be completed before
+          the FIFO is emptied, so there is no jarring pop or truncation of data
+        - stop FIFO running. Next initiation should reset the FIFO to ensure leftover previous data in
+          FIFO is cleared. 
+        
+        CODEC Interface
+        ================
+        
+        The interface assumes we have a sysclk domain running around 100MHz, and that our typical
+        max audio rate is 44.1kHz * 24bits * 2channels = 2.1168MHz audio clock. Thus, the architecture
+        treats the audio clock and data as asynchronous inputs that are MultiReg-syncd into the clock
+        domain. Probably the slowest sysclk rate this might work with is around 20-25MHz (10x over
+        sampling), but at 100MHz things will be quite comfortable.
+        
+        The upside of the fully asynchronous implementation is that we can leave the I/O unconstrained,
+        giving the place/route more latitude to do its job.
+        
+        Here's the timing format targeted by this I2S interface:
+        
+            .. wavedrom::
+                :caption: Timing format of the I2S interface
+
+                { "signal" : [
+                  { "name": "clk",         "wave": "n....|.......|......" },
+                  { "name": "sync",        "wave": "1.0..|....1..|....0." },
+                  { "name": "tx/rx",       "wave": ".====|==x.===|==x.=x", "data": ["L15", "L14", "...", "L1", "L0", "R15", "R14", "...", "R1", "R0", "L15"] },
+                ]}
+        
+        - Data is updated on the falling edge
+        - Data is sampled on the rising edge
+        - Words are MSB-to-LSB, left-justified (**NOTE: this is a deviation from strict I2S, which offsets by 1 from the left**)
+        - Sync is an input (FPGA is slave, codec is master): low => left channel, high => right channel
+        - Sync can be longer than the wordlen, extra bits are just ignored
+        - Tx is data to the codec (SDI pin on LM49352)
+        - Rx is data from the codec (SDO pin on LM49352)
+        
+        """)
+
+        # one cache line is 8 32-bit words, need to always have enough space for one line or else nothing works
+        if fifodepth > 504:
+            fifodepth = 504
+            print("I2S warning: fifo depth greater than 504 selected; truncating to 504")
+        if fifodepth < 8:
+            fifodepth = 8
+            print("I2S warning: fifo depth less than 8 selected; truncating to 8")
+
+        # connect pins, synchronizers, and edge detectors
+        if hasattr(pads, 'tx'):
+            tx_pin = Signal()
+            self.comb += pads.tx.eq(tx_pin)
+        if hasattr(pads, 'rx'):
+            rx_pin = Signal()
+            self.specials += MultiReg(pads.rx, rx_pin)
+        sync_pin = Signal()
+        self.specials += MultiReg(pads.sync, sync_pin)
+
+        clk_pin = Signal()
+        self.specials += MultiReg(pads.clk, clk_pin)
+        clk_d = Signal()
+        self.sync += clk_d.eq(clk_pin)
+        rising_edge = Signal()
+        falling_edge = Signal()
+        self.comb += [rising_edge.eq(clk_pin & ~clk_d), falling_edge.eq(~clk_pin & clk_d)]
+
+        # wishbone bus
+        self.bus = wishbone.Interface()
+        rd_ack = Signal()
+        wr_ack = Signal()
+        self.comb +=[
+            If(self.bus.we,
+               self.bus.ack.eq(wr_ack),
+            ).Else(
+                self.bus.ack.eq(rd_ack),
+            )
+        ]
+
+        # interrupts
+        self.submodules.ev = EventManager()
+        if hasattr(pads, 'rx'):
+            self.ev.rx_ready = EventSourcePulse()  # rising edge triggered, indicates FIFO is ready to read
+            self.ev.rx_error = EventSourcePulse()  # indicates a rx error has happened
+        if hasattr(pads, 'tx'):
+            self.ev.tx_ready = EventSourcePulse()  # indicates space available in Tx buffer for next quantum
+            self.ev.tx_error = EventSourcePulse()  # indicates a tx error has happened
+        self.ev.finalize()
+
+
+        # build the RX subsystem
+        if hasattr(pads, 'rx'):
+            rx_rd_d = Signal(32)
+            rx_almostfull = Signal()
+            rx_almostempty = Signal()
+            rx_full = Signal()
+            rx_empty = Signal()
+            rx_rdcount = Signal(9)
+            rx_rderr = Signal()
+            rx_wrerr = Signal()
+            rx_wrcount = Signal(9)
+            rx_rden = Signal()
+            rx_wr_d = Signal(32)
+            rx_wren = Signal()
+
+            self.rx_ctl = CSRStorage(description="Rx data path control",
+                fields=[
+                    CSRField("enable", size=1, description="Enable the receiving data"),
+                    CSRField("reset", size=1, description="Writing `1` resets the FIFO. Reset happens regardless of enable state.", pulse=1)
+                ])
+            self.rx_stat = CSRStatus(description="Rx data path status",
+                fields=[
+                    CSRField("overflow", size=1, description="Rx overflow"),
+                    CSRField("underflow", size=1, description="Rx underflow"),
+                    CSRField("dataready", size=1, description="{} words of data loaded and ready to read".format(fifodepth)),
+                    CSRField("empty", size=1, description="No data available in FIFO to read"), # next flags probably never used
+                    CSRField("wrcount", size=9, description="Write count"),
+                    CSRField("rdcount", size=9, description="Read count"),
+                    CSRField("fifodepth", size=9, description="FIFO depth as synthesized")
+                ])
+            self.comb += self.rx_stat.fields.fifodepth.eq(fifodepth)
+
+            rx_rst_cnt = Signal(3)
+            rx_reset = Signal()
+            self.sync += [
+                If(self.rx_ctl.fields.reset,
+                   rx_rst_cnt.eq(5),  # 5 cycles reset required by design
+                   rx_reset.eq(1)
+                ).Else(
+                    If(rx_rst_cnt == 0,
+                       rx_reset.eq(0)
+                    ).Else(
+                        rx_rst_cnt.eq(rx_rst_cnt - 1),
+                        rx_reset.eq(1)
+                    )
+                )
+            ]
+            # at a width of 32 bits, an 18kiB fifo is 512 entries deep
+            self.specials += Instance("FIFO_SYNC_MACRO",
+                                p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32,
+                                p_ALMOST_EMPTY_OFFSET=8, p_ALMOST_FULL_OFFSET=(512 - fifodepth),
+                                p_DO_REG=0,
+
+                                o_ALMOSTFULL=rx_almostfull, o_ALMOSTEMPTY=rx_almostempty,
+                                o_DO=rx_rd_d, o_EMPTY=rx_empty, o_FULL=rx_full,
+                                o_RDCOUNT=rx_rdcount, o_RDERR=rx_rderr, o_WRCOUNT=rx_wrcount, o_WRERR=rx_wrerr,
+                                i_DI=rx_wr_d, i_CLK=ClockSignal(), i_RDEN=rx_rden & ~rx_reset,
+                                i_WREN=rx_wren & ~rx_reset, i_RST=rx_reset,
+            )
+            self.comb += [  # wire up the status signals and interrupts
+                self.rx_stat.fields.overflow.eq(rx_wrerr),
+                self.rx_stat.fields.underflow.eq(rx_rderr),
+                self.rx_stat.fields.dataready.eq(rx_almostfull),
+                self.rx_stat.fields.wrcount.eq(rx_wrcount),
+                self.rx_stat.fields.rdcount.eq(rx_rdcount),
+                self.ev.rx_ready.trigger.eq(rx_almostfull),
+                self.ev.rx_error.trigger.eq(rx_wrerr | rx_rderr),
+            ]
+            bus_read = Signal()
+            bus_read_d = Signal()
+            rd_ack_pipe = Signal()
+            self.comb += bus_read.eq(self.bus.cyc & self.bus.stb & ~self.bus.we & (self.bus.cti == 0))
+            self.sync += [  # this is the bus responder -- only works for uncached memory regions
+                bus_read_d.eq(bus_read),
+                If(bus_read & ~bus_read_d, # one response, one cycle
+                   rd_ack_pipe.eq(1),
+                   If(~rx_empty,
+                      self.bus.dat_r.eq(rx_rd_d),
+                      rx_rden.eq(1),
+                   ).Else(
+                       self.bus.dat_r.eq(0xDEADBEEF),  # don't stall the bus indefinitely if we try to read from an empty fifo...just return garbage
+                       rx_rden.eq(0),
+                   )
+                ).Else(
+                    rx_rden.eq(0),
+                    rd_ack_pipe.eq(0),
+                ),
+                rd_ack.eq(rd_ack_pipe),
+            ]
+
+            rx_cnt = Signal(5)
+            self.submodules.rxi2s = rxi2s = FSM(reset_state="IDLE")
+            rxi2s.act("IDLE",
+                      NextValue(rx_wr_d, 0),
+                      If(self.rx_ctl.fields.enable,
+                         If(rising_edge & sync_pin, # wait_sync guarantees we start at the beginning of a left frame, and not in the middle
+                            NextState("WAIT_SYNC"),
+                         )
+                      )
+            ),
+            rxi2s.act("WAIT_SYNC",
+                      If(rising_edge & ~sync_pin,
+                         NextState("LEFT"),
+                         NextValue(rx_cnt, 16),
+                         ),
+            )
+            rxi2s.act("LEFT",
+                      If(~self.rx_ctl.fields.enable, NextState("IDLE")).Else(
+                          NextValue(rx_wr_d, Cat(rx_pin, rx_wr_d[:-1])),
+                          NextValue(rx_cnt, rx_cnt - 1),
+                          NextState("LEFT_WAIT"),
+                      )
+            )
+            rxi2s.act("LEFT_WAIT",
+                      If(~self.rx_ctl.fields.enable, NextState("IDLE")).Else(
+                          If(rising_edge,
+                              If((rx_cnt == 0) & sync_pin,
+                                 NextValue(rx_cnt, 16),
+                                 NextState("RIGHT")
+                              ).Elif(rx_cnt > 0,
+                                NextState("LEFT"),
+                              )
+                          )
+                      )
+            )
+            rxi2s.act("RIGHT",
+                      If(~self.rx_ctl.fields.enable, NextState("IDLE")).Else(
+                          NextValue(rx_wr_d, Cat(rx_pin, rx_wr_d[:-1])),
+                          NextValue(rx_cnt, rx_cnt - 1),
+                          NextState("RIGHT_WAIT"),
+                      )
+            )
+            rxi2s.act("RIGHT_WAIT",
+                      If(~self.rx_ctl.fields.enable, NextState("IDLE")).Else(
+                          If(rising_edge,
+                              If((rx_cnt == 0) & ~sync_pin,
+                                 NextValue(rx_cnt, 16),
+                                 NextState("LEFT"),
+                                 rx_wren.eq(1), # pulse rx_wren to write the current data word
+                              ).Elif(rx_cnt > 0,
+                                NextState("RIGHT"),
+                              )
+                          )
+                      )
+            )
+
+
+        # build the TX subsystem
+        if hasattr(pads, 'tx'):
+            tx_rd_d = Signal(32)
+            tx_almostfull = Signal()
+            tx_almostempty = Signal()
+            tx_full = Signal()
+            tx_empty = Signal()
+            tx_rdcount = Signal(9)
+            tx_rderr = Signal()
+            tx_wrerr = Signal()
+            tx_wrcount = Signal(9)
+            tx_rden = Signal()
+            tx_wr_d = Signal(32)
+            tx_wren = Signal()
+
+            self.tx_ctl = CSRStorage(description="Tx data path control",
+                fields=[
+                    CSRField("enable", size=1, description="Enable the transmission data"),
+                    CSRField("reset", size=1, description="Writing `1` resets the FIFO. Reset happens regardless of enable state.", pulse=1)
+                ])
+            self.tx_stat = CSRStatus(description="Tx data path status",
+                fields=[
+                    CSRField("overflow", size=1, description="Tx overflow"),
+                    CSRField("underflow", size=1, description="Tx underflow"),
+                    CSRField("free", size=1, description="At least {} words of space free".format(fifodepth)),
+                    CSRField("almostfull", size=1, description="Less than 8 words space available"), # the next few flags should be rarely used
+                    CSRField("full", size=1, description="FIFO is full or overfull"),
+                    CSRField("empty", size=1, description="FIFO is empty"),
+                    CSRField("wrcount", size=9, description="Tx write count"),
+                    CSRField("rdcount", size=9, description="Tx read count"),
+                ])
+
+            tx_rst_cnt = Signal(3)
+            tx_reset = Signal()
+            self.sync += [
+                If(self.tx_ctl.fields.reset,
+                   tx_rst_cnt.eq(5),  # 5 cycles reset required by design
+                   tx_reset.eq(1)
+                ).Else(
+                    If(tx_rst_cnt == 0,
+                       tx_reset.eq(0)
+                    ).Else(
+                        tx_rst_cnt.eq(tx_rst_cnt - 1),
+                        tx_reset.eq(1)
+                    )
+                )
+            ]
+            # at a width of 32 bits, an 18kiB fifo is 512 entries deep
+            self.specials += Instance("FIFO_SYNC_MACRO",
+                                p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32,
+                                p_ALMOST_EMPTY_OFFSET=fifodepth, p_ALMOST_FULL_OFFSET=8,
+                                p_DO_REG=0,
+
+                                o_ALMOSTFULL=tx_almostfull, o_ALMOSTEMPTY=tx_almostempty,
+                                o_DO=tx_rd_d, o_EMPTY=tx_empty, o_FULL=tx_full,
+                                o_RDCOUNT=tx_rdcount, o_RDERR=tx_rderr, o_WRCOUNT=tx_wrcount, o_WRERR=tx_wrerr,
+                                i_DI=tx_wr_d, i_CLK=ClockSignal(), i_RDEN=tx_rden & ~tx_reset,
+                                i_WREN=tx_wren & ~tx_reset, i_RST=tx_reset,
+            )
+            self.comb += [  # wire up the status signals and interrupts
+                self.tx_stat.fields.overflow.eq(tx_wrerr),
+                self.tx_stat.fields.underflow.eq(tx_rderr),
+                self.tx_stat.fields.free.eq(tx_almostempty),
+                self.tx_stat.fields.almostfull.eq(tx_almostfull),
+                self.tx_stat.fields.full.eq(tx_full),
+                self.tx_stat.fields.empty.eq(tx_empty),
+                self.tx_stat.fields.rdcount.eq(tx_rdcount),
+                self.tx_stat.fields.wrcount.eq(tx_wrcount),
+                self.ev.tx_ready.trigger.eq(tx_almostempty),
+                self.ev.tx_error.trigger.eq(tx_wrerr | tx_rderr),
+            ]
+            self.sync += [  # this is the bus responder -- need to check how this interacts with uncached memory region
+                If(self.bus.cyc & self.bus.stb & self.bus.we & ~self.bus.ack,
+                   If(~tx_full,
+                      tx_wr_d.eq(self.bus.dat_w),
+                      tx_wren.eq(1),
+                      wr_ack.eq(1),
+                   ).Else(
+                       tx_wren.eq(0),
+                       wr_ack.eq(0),
+                   )
+                ).Else(
+                    tx_wren.eq(0),
+                    wr_ack.eq(0),
+                )
+            ]
+
+            tx_cnt = Signal(5)
+            tx_buf = Signal(32)
+            self.submodules.txi2s = txi2s = FSM(reset_state="IDLE")
+            txi2s.act("IDLE",
+                      If(self.tx_ctl.fields.enable,
+                         If(falling_edge & sync_pin,
+                            NextState("WAIT_SYNC"),
+                         )
+                      )
+            ),
+            txi2s.act("WAIT_SYNC",
+                      If(falling_edge & ~sync_pin,
+                         NextState("LEFT"),
+                         NextValue(tx_cnt, 16),
+                         NextValue(tx_buf, tx_rd_d),
+                         tx_rden.eq(1),
+                      ),
+            )
+            txi2s.act("LEFT",
+                      If(~self.tx_ctl.fields.enable, NextState("IDLE")).Else(
+                          NextValue(tx_pin, tx_buf[31]),
+                          NextValue(tx_buf, Cat(0, tx_buf[:-1])),
+                          NextValue(tx_cnt, tx_cnt - 1),
+                          NextState("LEFT_WAIT"),
+                      )
+            )
+            txi2s.act("LEFT_WAIT",
+                      If(~self.tx_ctl.fields.enable, NextState("IDLE")).Else(
+                          If(falling_edge,
+                              If((tx_cnt == 0) & sync_pin,
+                                 NextValue(tx_cnt, 16),
+                                 NextState("RIGHT")
+                              ).Elif(tx_cnt > 0,
+                                NextState("LEFT"),
+                              )
+                          )
+                      )
+            )
+            txi2s.act("RIGHT",
+                      If(~self.tx_ctl.fields.enable, NextState("IDLE")).Else(
+                          NextValue(tx_pin, tx_buf[31]),
+                          NextValue(tx_buf, Cat(0, tx_buf[:-1])),
+                          NextValue(tx_cnt, tx_cnt - 1),
+                          NextState("RIGHT_WAIT"),
+                      )
+            )
+            txi2s.act("RIGHT_WAIT",
+                      If(~self.tx_ctl.fields.enable, NextState("IDLE")).Else(
+                          If(falling_edge,
+                              If((tx_cnt == 0) & ~sync_pin,
+                                 NextValue(tx_cnt, 16),
+                                 NextState("LEFT"),
+                                 NextValue(tx_buf, tx_rd_d),
+                                 tx_rden.eq(1),
+                              ).Elif(tx_cnt > 0,
+                                NextState("RIGHT"),
+                              )
+                          )
+                      )
+            )
\ No newline at end of file
diff --git a/litex/soc/cores/spiopi.py b/litex/soc/cores/spiopi.py
new file mode 100644
index 000000000..941d65c62
--- /dev/null
+++ b/litex/soc/cores/spiopi.py
@@ -0,0 +1,1080 @@
+# This file is Copyright (c) 2020 bunnie <bunnie@kosagi.com>
+# License: BSD
+
+from litex.soc.interconnect.csr_eventmanager import *
+from litex.soc.interconnect import wishbone
+from litex.soc.integration.doc import AutoDoc, ModuleDoc
+from migen.genlib.cdc import MultiReg
+
+class SpiOpi(Module, AutoCSR, AutoDoc):
+    def __init__(self, pads, dq_delay_taps=31, sclk_name="SCLK_ODDR",
+                 iddr_name="SPI_IDDR", miso_name="MISO_FDRE", sim=False, spiread=False, prefetch_lines=1):
+        self.intro = ModuleDoc("""
+        Intro
+        ********
+
+        SpiOpi implements a dual-mode SPI or OPI interface. OPI is an octal (8-bit) wide
+        variant of SPI, which is unique to Macronix parts. It is concurrently interoperable
+        with SPI. The chip supports "DTR mode" (double transfer rate, e.g. DDR) where data
+        is transferred on each edge of the clock, and there is a source-synchronous DQS
+        associated with the input data.
+
+        The chip by default boots into SPI-only mode (unless NV bits are burned otherwise)
+        so to enable OPI, a config register needs to be written with SPI mode. Note that once
+        the config register is written, the only way to return to SPI mode is to change
+        it with OPI writes, or to issue a hardware reset. This has major implications for
+        reconfiguring the FPGA: a simple JTAG command to reload from SPI will not yank PROG_B low,
+        and so the SPI ROM will be in DOPI, and SPI loading will fail. Thus, system architects
+        must take into consideration a hard reset for the ROM whenever a bitstream reload
+        is demanded of the FPGA. 
+
+        The SpiOpi architecture is split into two levels: a command manager, and a
+        cycle manager. The command manager is responsible for taking the current wishbone
+        request and CSR state and unpacking these into cycle-by-cycle requests. The cycle
+        manager is responsible for coordinating the cycle-by-cycle requests. 
+
+        In SPI mode, this means marshalling byte-wide requests into a series of 8 serial cyles.
+
+        In OPI [DOPI] mode, this means marshalling 16-bit wide requests into a pair of back-to-back DDR
+        cycles. Note that because the cycles are DDR, this means one 16-bit wide request must be
+        issued every cycle to keep up with the interface. 
+
+        For the output of data to ROM, expects a clock called "spinor_delayed" which is a delayed 
+        version of "sys". The delay is necessary to get the correct phase relationship between 
+        the SIO and SCLK in DTR/DDR mode, and it also has to compensate for the special-case
+        difference in the CCLK pad vs other I/O.
+
+        For the input, DQS signal is independently delayed relative to the DQ signals using
+        an IDELAYE2 block. At a REFCLK frequency of 200 MHz, each delay tap adds 78ps, so up
+        to a 2.418ns delay is possible between DQS and DQ. The goal is to delay DQS relative
+        to DQ, because the SPI chip launches both with concurrent rising edges (to within 0.6ns),
+        but the IDDR register needs the rising edge of DQS to be centered inside the DQ eye.
+
+        In DOPI mode, there is a prefetch buffer. It will read `prefetch_lines` cache lines of 
+        data into the prefetch buffer. A cache line is 256 bits (or 8x32-bit words). The maximum 
+        value is 63 lines (one line is necessary for synchronization margin). The downside of
+        setting prefetch_lines high is that the prefetcher is running constantly and burning
+        power, while throwing away most data. In practice, the CPU will typically consume data
+        at only slightly faster than the rate of read-out from DOPI-mode ROM, and once data
+        is consumed the prefetch resumes. Thus, prefetch_lines is probably optimally around
+        1-3 lines read-ahead of the CPU. Any higher than 3 lines probably just wastes power.
+        In short simulations, 1 line of prefetch seems to be enough to keep the prefetcher
+        ahead of the CPU even when it's simply running straight-line code.
+
+        Note the "sim" parameter exists because there seems to be a bug in xvlog that doesn't
+        correctly simulate the IDELAY machines. Setting "sim" to True removes the IDELAY machines
+        and passes the data through directly, but in real hardware the IDELAY machines are
+        necessary to meet timing between DQS and DQ.  
+
+        dq_delay_taps probably doesn't need to be adjusted; it can be tweaked for timing
+        closure. The delays can also be adjusted at runtime.
+        """)
+        if prefetch_lines > 63:
+            prefetch_lines = 63
+
+        self.spi_mode = Signal(reset=1)  # when reset is asserted, force into spi mode
+        cs_n = Signal(reset=1)  # make sure CS is sane on reset, too
+
+        self.config = CSRStorage(fields=[
+            CSRField("dummy", size=5, description="Number of dummy cycles", reset=10),
+        ])
+
+        delay_type = "VAR_LOAD"
+
+        # DQS input conditioning -----------------------------------------------------------------
+        dqs_iobuf = Signal()
+        self.clock_domains.cd_dqs = ClockDomain(reset_less=True)
+        self.comb += self.cd_dqs.clk.eq(dqs_iobuf)
+        self.specials += [
+            Instance("BUFR", i_I=pads.dqs, o_O=dqs_iobuf),
+        ]
+
+        # DQ connections -------------------------------------------------------------------------
+        # PHY API
+        self.do = Signal(16)  # OPI data to SPI
+        self.di = Signal(16)  # OPI data from SPI
+        self.tx = Signal()  # when asserted OPI is transmitting data to SPI, otherwise, receiving
+
+        self.mosi = Signal()  # SPI data to SPI
+        self.miso = Signal()  # SPI data from SPI
+
+        # Delay programming API
+        self.delay_config = CSRStorage(fields=[
+            CSRField("d", size=5, description="Delay amount; each increment is 78ps", reset=31),
+            CSRField("load", size=1, description="Force delay taps to delay_d"),
+        ])
+        self.delay_status = CSRStatus(fields=[
+            CSRField("q", size=5, description="Readback of current delay amount, useful if inc/ce is used to set"),
+        ])
+        self.delay_update = Signal()
+        self.hw_delay_load = Signal()
+        self.sync += self.delay_update.eq(self.hw_delay_load | self.delay_config.fields.load)
+
+        # Break system API into rising/falling edge samples
+        do_rise = Signal(8)  # data output presented on the rising edge
+        do_fall = Signal(8)  # data output presented on the falling edge
+        self.comb += [do_rise.eq(self.do[8:]), do_fall.eq(self.do[:8])]
+
+        di_rise = Signal(8)
+        di_fall = Signal(8)
+        self.comb += self.di.eq(Cat(di_fall, di_rise))
+
+        # OPI DDR registers
+        dq = TSTriple(7)  # dq[0] is special because it is also MOSI
+        dq_delayed = Signal(8)
+        self.specials += dq.get_tristate(pads.dq[1:])
+        for i in range(1, 8):
+            self.specials += Instance("ODDR",
+                                      p_DDR_CLK_EDGE="SAME_EDGE",
+                                      i_C=ClockSignal(), i_R=ResetSignal(), i_S=0, i_CE=1,
+                                      i_D1=do_rise[i], i_D2=do_fall[i], o_Q=dq.o[i - 1],
+                                      )
+            if sim == False:
+                if i == 1:  # only wire up o_CNTVALUEOUT for one instance
+                    self.specials += Instance("IDELAYE2",
+                                              p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                                              p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE",
+                                              p_REFCLK_FREQUENCY=200.0,
+                                              p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps,
+                                              p_IDELAY_TYPE=delay_type,
+
+                                              i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0,
+                                              i_CE=0,
+                                              i_LD=self.delay_update,
+                                              i_CNTVALUEIN=self.delay_config.fields.d,
+                                              o_CNTVALUEOUT=self.delay_status.fields.q,
+                                              i_IDATAIN=dq.i[i - 1], o_DATAOUT=dq_delayed[i],
+                                              ),
+                else:  # don't wire up o_CNTVALUEOUT for others
+                    self.specials += Instance("IDELAYE2",
+                                              p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                                              p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE",
+                                              p_REFCLK_FREQUENCY=200.0,
+                                              p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps,
+                                              p_IDELAY_TYPE=delay_type,
+
+                                              i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0,
+                                              i_CE=0,
+                                              i_LD=self.delay_update,
+                                              i_CNTVALUEIN=self.delay_config.fields.d,
+                                              i_IDATAIN=dq.i[i - 1], o_DATAOUT=dq_delayed[i],
+                                              ),
+            else:
+                self.comb += dq_delayed[i].eq(dq.i[i - 1])
+            self.specials += Instance("IDDR", name="{}{}".format(iddr_name, str(i)),
+                                      p_DDR_CLK_EDGE="SAME_EDGE_PIPELINED",
+                                      i_C=dqs_iobuf, i_R=ResetSignal(), i_S=0, i_CE=1,
+                                      i_D=dq_delayed[i], o_Q1=di_rise[i], o_Q2=di_fall[i],
+                                      )
+        # SPI SDR register
+        self.specials += [
+            Instance("FDRE", name="{}".format(miso_name), i_C=~ClockSignal("spinor"), i_CE=1, i_R=0, o_Q=self.miso,
+                     i_D=dq_delayed[1],
+                     )
+        ]
+
+        # bit 0 (MOSI) is special-cased to handle SPI mode
+        dq_mosi = TSTriple(1)  # this has similar structure but an independent "oe" signal
+        self.specials += dq_mosi.get_tristate(pads.dq[0])
+        do_mux_rise = Signal()  # mux signal for mosi/dq select of bit 0
+        do_mux_fall = Signal()
+        self.specials += [
+            Instance("ODDR",
+                     p_DDR_CLK_EDGE="SAME_EDGE",
+                     i_C=ClockSignal(), i_R=ResetSignal(), i_S=0, i_CE=1,
+                     i_D1=do_mux_rise, i_D2=do_mux_fall, o_Q=dq_mosi.o,
+                     ),
+            Instance("IDDR",
+                     p_DDR_CLK_EDGE="SAME_EDGE_PIPELINED",
+                     i_C=dqs_iobuf, i_R=ResetSignal(), i_S=0, i_CE=1,
+                     o_Q1=di_rise[0], o_Q2=di_fall[0], i_D=dq_delayed[0],
+                     ),
+        ]
+        if sim == False:
+            self.specials += [
+                Instance("IDELAYE2",
+                         p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                         p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
+                         p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
+
+                         i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
+                         i_LD=self.delay_update,
+                         i_CNTVALUEIN=self.delay_config.fields.d,
+                         i_IDATAIN=dq_mosi.i, o_DATAOUT=dq_delayed[0],
+                         ),
+            ]
+        else:
+            self.comb += dq_delayed[0].eq(dq_mosi.i)
+
+        # wire up SCLK interface
+        clk_en = Signal()
+        self.specials += [
+            # de-activate the CCLK interface, parallel it with a GPIO
+            Instance("STARTUPE2",
+                     i_CLK=0, i_GSR=0, i_GTS=0, i_KEYCLEARB=0, i_PACK=0, i_USRDONEO=1, i_USRDONETS=1,
+                     i_USRCCLKO=0, i_USRCCLKTS=1,  # force to tristate
+                     ),
+            Instance("ODDR", name=sclk_name,  # need to name this so we can constrain it properly
+                     p_DDR_CLK_EDGE="SAME_EDGE",
+                     i_C=ClockSignal("spinor"), i_R=ResetSignal("spinor"), i_S=0, i_CE=1,
+                     i_D1=clk_en, i_D2=0, o_Q=pads.sclk,
+                     )
+        ]
+
+        # wire up CS_N
+        spi_cs_n = Signal()
+        opi_cs_n = Signal()
+        self.comb += cs_n.eq((self.spi_mode & spi_cs_n) | (~self.spi_mode & opi_cs_n))
+        self.specials += [
+            Instance("ODDR",
+                     p_DDR_CLK_EDGE="SAME_EDGE",
+                     i_C=ClockSignal(), i_R=0, i_S=ResetSignal(), i_CE=1,
+                     i_D1=cs_n, i_D2=cs_n, o_Q=pads.cs_n,
+                     ),
+        ]
+
+        self.architecture = ModuleDoc("""
+        Architecture
+        **************
+
+        The machine is split into two separate pieces, one to handle SPI, and one to handle OPI.
+
+        SPI
+        =====
+        The SPI machine architecture is split into two levels: MAC and PHY. 
+
+        The MAC layer is responsible for:
+        - receiving requests via CSR register to perform config/status/special command sequences,
+        and dispatching these to the SPI PHY
+        - translating wishbone bus requests into command sequences, and routing them to either OPI
+        or SPI PHY.
+        - managing the chip select to the chip, and ensuring that one dummy cycle is inserted after
+        chip select is asserted, or before it is de-asserted; and that the chip select "high" times
+        are adequate (1 cycle between reads, 4 cycles for all other operations)
+
+        On boot, the interface runs in SPI; once the wakeup sequence is executed, the chip permanently
+        switches to OPI mode unless the CR2 registers are written to fall back, or the 
+        reset to the chip is asserted.
+
+        The PHY layers are responsible for the following tasks:
+        - Serializing and deserializing data, standardized on 8 bits for SPI and 16 bits for OPI
+        - counting dummy cycles
+        - managing the clock enable
+
+        PHY cycles are initiated with a "req" signal, which is only sampled for 
+        one cycle and then ignored until the PHY issues an "ack" that the current cycle is complete. 
+        Thus holding "req" high can allow the PHY to back-to-back issue cycles without pause.
+
+        OPI
+        =====
+        The OPI machine is split into three parts: a command controller, a Tx PHY, and an Rx PHY. 
+
+        The Tx PHY is configured with a "dummy cycle" count register, as there is a variable length
+        delay for dummy cycles in OPI.
+
+        In OPI mode, read data is `mesochronous`, that is, they return at precisely the same frequency
+        as SCLK, but with an unknown phase relationship. The DQS strobe is provided as a "hint" to
+        the receiving side to help retime the data. The mesochronous nature of the read data is
+        why the Tx and Rx PHY must be split into two separate machines, as they are operating in
+        different clock domains.
+
+        DQS is implemented on the ROM as an extra data output that is guaranteed to change polarity with 
+        each data byte; the skew mismatch of DQS to data is within +/-0.6ns or so. It turns out the mere
+        act of routing the DQS into a BUFR buffer before clocking the data into an IDDR primitive
+        is sufficient to delay the DQS signal and meet setup and hold time on the IDDR. 
+
+        Once captured by the IDDR, the data is fed into a dual-clock FIFO to make the transition
+        from the DQS to sysclk domains cleanly. 
+
+        Because of the latency involved in going from pin->IDDR->FIFO, excess read cycles are
+        required beyond the end of the requested cache line. However, there is virtually no 
+        penalty in pre-filling the FIFO with data; if a new cache line has to be fetched, 
+        the FIFO can simply be reset and all pointers zeroed. In fact, pre-filling the FIFO
+        can lead to great performance benefits if sequential cache lines are requested. In
+        simulation, a cache line can be filled in 10 bus cycles if it happens to be prefetched
+        (as opposed to 49 bus cycles for random reads). Either way, this compares favorably to
+        288 cycles for random reads in 100MHz SPI mode (or 576 for the spimemio.v, which runs at 
+        50MHz).   
+
+        The command controller is repsonsible for sequencing all commands other than fast reads. Most
+        commands have some special-case structure to them, and as more commands are implemented, the
+        state machine is expected to grow fairly large. Fast reads are directly handled in "tx_run"
+        mode, where the TxPhy and RxPhy run a tight loop to watch incoming read bus cycles, check
+        the current address, fill the prefetch fifo, and respond to bus cycles. 
+
+        Writes to ROM might lock up the machine; a TODO is to test this and do something more sane,
+        like ignore writes by sending an ACK immediately while discarding the data.
+
+        Thus, an OPI read proceeds as follows:
+
+        - When BUS/STB are asserted:
+           TxPhy:
+
+           - capture bus_adr, and compare against the *next read* address pointer
+              - if they match, allow the PHYs to do the work
+
+           - if bus_adr and next read address don't match, save to next read address pointer, and 
+             cycle wr/rd clk for 5 cycle while asserting reset to reset the FIFO
+           - initiate an 8DTRD with the read address pointer
+           - wait the specified dummy cycles
+
+           - greedily pre-fill the FIFO by continuing to clock DQS until either:
+             - the FIFO is full
+             - pre-fetch is aborted because bus_adr and next read address don't match and FIFO is reset
+
+           RxPHY:
+           - while CTI==2, assemble data into 32-bit words as soon as EMPTY is deasserted,
+             present a bus_ack, and increment the next read address pointer
+           - when CTI==7, ack the data, and wait until the next bus cycle with CTI==2 to resume
+             reading
+
+        - A FIFO_SYNC_MACRO is used to instantiate the FIFO. This is chosen because:
+           - we can specify RAMB18's, which seem to be under-utilized by the auto-inferred memories by migen
+           - the XPM_FIFO_ASYNC macro claims no instantiation support, and also looks like it has weird
+             requirements for resetting the pointers: you must check the reset outputs, and the time to
+             reset is reported to be as high as around 200ns (anecdotally -- could be just that the sim I
+             read on the web is using a really slow clock, but I'm guessing it's around 10 cycles). 
+           - the FIFO_SYNC_MACRO has a well-specified fixed reset latency of 5 cycles.
+           - The main downside of FIFO_SYNC_MACRO over XPM_FIFO_ASYNC is that XPM_FIFO_ASYNC can automatically 
+             allow for output data to be read at 32-bit widths, with writes at 16-bit widths. However, with a
+             bit of additional logic and pipelining, we can aggregate data into 32-bit words going into a
+             32-bit FIFO_SYNC_MACRO, which is what we do in this implementation. 
+        """)
+        self.bus = wishbone.Interface()
+
+        self.command = CSRStorage(
+            description="Write individual bits to issue special commands to SPI; setting multiple bits at once leads to undefined behavior.",
+            fields=[
+                CSRField("wakeup", size=1, description="Sequence through init & wakeup routine"),
+                CSRField("sector_erase", size=1, description="Erase a sector"),
+            ])
+        self.sector = CSRStorage(description="Sector to erase",
+                                 fields=[
+                                     CSRField("sector", size=32, description="Sector to erase")
+                                 ])
+        self.status = CSRStatus(description="Interface status",
+                                fields=[
+                                    CSRField("wip", size=1, description="Operation in progress (write or erease)")
+                                ])
+        # TODO: implement ECC detailed register readback, CRC checking
+
+        # PHY machine mux --------------------------------------------------------------------------
+        # clk_en mux
+        spi_clk_en = Signal()
+        opi_clk_en = Signal()
+        self.sync += clk_en.eq(~self.spi_mode & opi_clk_en | self.spi_mode & spi_clk_en)
+        # tristate mux
+        self.sync += [
+            dq.oe.eq(~self.spi_mode & self.tx),
+            dq_mosi.oe.eq(self.spi_mode | self.tx),
+        ]
+        # data out mux (no data in mux, as we can just sample data in all the time without harm)
+        self.comb += do_mux_rise.eq(~self.spi_mode & do_rise[0] | self.spi_mode & self.mosi)
+        self.comb += do_mux_fall.eq(~self.spi_mode & do_fall[0] | self.spi_mode & self.mosi)
+
+        has_dummy = Signal()  # indicates if the current "req" requires dummy cycles to be appended (used for both OPI/SPI)
+        rom_addr = Signal(32,
+                          reset=0xFFFFFFFC)  # location of the internal ROM address pointer; reset to invalid address to force an address request on first read
+
+        # MAC/PHY abstraction for OPI
+        txphy_do = Signal(16)  # two sources of data out for OPI, one from the PHY, one from MAC
+        txcmd_do = Signal(16)
+        opi_di = Signal(16)
+
+        # internal machines
+        opi_addr = Signal(32)
+        opi_fifo_rd = Signal(32)
+        opi_fifo_wd = Signal(32)
+        opi_reset_rx_req = Signal()
+        opi_reset_rx_ack = Signal()
+        opi_rx_run = Signal()
+
+        rx_almostempty = Signal()
+        rx_almostfull = Signal()
+        rx_empty = Signal()
+        rx_full = Signal()
+        rx_rdcount = Signal(9)
+        rx_rderr = Signal()
+        rx_wrcount = Signal(9)
+        rx_wrerr = Signal()
+        rx_rden = Signal()
+        rx_wren = Signal(reset=1)
+        rx_fifo_rst = Signal()
+
+        wrendiv = Signal()
+        wrendiv2 = Signal()
+        self.specials += [
+            # this next pair of async-clear flip flops creates a write-enable gate that (a) ignores the first
+            # two DQS strobes (as they are pipe-filling) and (b) alternates with the correct phase so we are
+            # sampling 32-bit data into the FIFO.
+            Instance("FDCE", name="FDCE_WREN",
+                     i_C=dqs_iobuf, i_D=~wrendiv, o_Q=wrendiv, i_CE=1, i_CLR=~rx_wren,
+                     ),
+            Instance("FDCE", name="FDCE_WREN",
+                     i_C=dqs_iobuf, i_D=~wrendiv2, o_Q=wrendiv2, i_CE=wrendiv & ~wrendiv2, i_CLR=~rx_wren,
+                     ),
+            # Direct FIFO primitive is more resource-efficient and faster than migen primitive.
+            Instance("FIFO_DUALCLOCK_MACRO",
+                     p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32, p_FIRST_WORD_FALL_THROUGH="TRUE",
+                     p_ALMOST_EMPTY_OFFSET=6, p_ALMOST_FULL_OFFSET=(512 - (8 * prefetch_lines)),
+
+                     o_ALMOSTEMPTY=rx_almostempty, o_ALMOSTFULL=rx_almostfull,
+                     o_DO=opi_fifo_rd, o_EMPTY=rx_empty, o_FULL=rx_full,
+                     o_RDCOUNT=rx_rdcount, o_RDERR=rx_rderr, o_WRCOUNT=rx_wrcount, o_WRERR=rx_wrerr,
+                     i_DI=opi_fifo_wd, i_RDCLK=ClockSignal(), i_RDEN=rx_rden,
+                     i_WRCLK=dqs_iobuf, i_WREN=wrendiv & wrendiv2, i_RST=rx_fifo_rst,
+                     )
+        ]
+        self.sync.dqs += opi_di.eq(self.di)
+        self.comb += opi_fifo_wd.eq(Cat(opi_di, self.di))
+
+        # ---------  OPI Rx Phy machine ------------------------------
+        self.submodules.rxphy = rxphy = FSM(reset_state="IDLE")
+        cti_pipe = Signal(3)
+        rxphy_cnt = Signal(3)
+        rxphy.act("IDLE",
+                  If(self.spi_mode,
+                     NextState("IDLE"),
+                     ).Else(
+                      NextValue(self.bus.ack, 0),
+                      If(opi_reset_rx_req,
+                         NextState("WAIT_RESET"),
+                         NextValue(rxphy_cnt, 6),
+                         NextValue(rx_wren, 0),
+                         NextValue(rx_fifo_rst, 1),
+                         ).Elif(opi_rx_run,
+                                NextValue(rx_wren, 1),
+                                If((self.bus.cyc & self.bus.stb & ~self.bus.we) & ((self.bus.cti == 2) |
+                                                                                   ((
+                                                                                                self.bus.cti == 7) & ~self.bus.ack)),
+                                   # handle case of non-pipelined read, ack is late
+                                   If(~rx_empty,
+                                      NextValue(self.bus.dat_r, opi_fifo_rd),
+                                      rx_rden.eq(1),
+                                      NextValue(opi_addr, opi_addr + 4),
+                                      NextValue(self.bus.ack, 1),
+                                      )
+                                   )
+                                )
+                  )
+                  )
+        rxphy.act("WAIT_RESET",
+                  NextValue(opi_addr, Cat(Signal(2), self.bus.adr)),
+                  NextValue(rxphy_cnt, rxphy_cnt - 1),
+                  If(rxphy_cnt == 0,
+                     NextValue(rx_fifo_rst, 0),
+                     opi_reset_rx_ack.eq(1),
+                     NextState("IDLE"),
+                     )
+                  )
+
+        # TxPHY machine: OPI -------------------------------------------------------------------------
+        txphy_cnt = Signal(4)
+        tx_run = Signal()
+        txphy_cs_n = Signal(reset=1)
+        txcmd_cs_n = Signal(reset=1)
+        txphy_clken = Signal()
+        txcmd_clken = Signal()
+        txphy_oe = Signal()
+        txcmd_oe = Signal()
+        self.sync += opi_cs_n.eq((tx_run & txphy_cs_n) | (~tx_run & txcmd_cs_n))
+        self.comb += If(tx_run, self.do.eq(txphy_do)).Else(self.do.eq(txcmd_do))
+        self.comb += opi_clk_en.eq((tx_run & txphy_clken) | (~tx_run & txcmd_clken))
+        self.comb += self.tx.eq((tx_run & txphy_oe) | (~tx_run & txcmd_oe))
+        tx_almostfull = Signal()
+        self.sync += tx_almostfull.eq(rx_almostfull)  # sync the rx_almostfull signal into the local clock domain
+        txphy_bus = Signal()
+        self.sync += txphy_bus.eq(self.bus.cyc & self.bus.stb & ~self.bus.we & (self.bus.cti == 2))
+        tx_resetcycle = Signal()
+
+        self.submodules.txphy = txphy = FSM(reset_state="RESET")
+        txphy.act("RESET",
+                  NextValue(opi_rx_run, 0),
+                  NextValue(txphy_oe, 0),
+                  NextValue(txphy_cs_n, 1),
+                  NextValue(txphy_clken, 0),
+                  # guarantee that the first state we go to out of reset is a four-cycle burst
+                  NextValue(txphy_cnt, 4),
+                  If(tx_run & ~self.spi_mode, NextState("TX_SETUP"))
+                  )
+        txphy.act("TX_SETUP",
+                  NextValue(opi_rx_run, 0),
+                  NextValue(txphy_cnt, txphy_cnt - 1),
+                  If(txphy_cnt > 0,
+                     NextValue(txphy_cs_n, 1)
+                     ).Else(
+                      NextValue(txphy_cs_n, 0),
+                      NextValue(txphy_oe, 1),
+                      NextState("TX_CMD_CS_DELAY"),
+                  )
+                  )
+        txphy.act("TX_CMD_CS_DELAY",  # meet setup timing for CS-to-clock
+                  NextState("TX_CMD"),
+                  )
+        txphy.act("TX_CMD",
+                  NextValue(txphy_do, 0xEE11),
+                  NextValue(txphy_clken, 1),
+                  NextState("TX_ADRHI"),
+                  )
+        txphy.act("TX_ADRHI",
+                  NextValue(txphy_do, opi_addr[16:] & 0x07FF),  # mask off unused bits
+                  NextState("TX_ADRLO"),
+                  )
+        txphy.act("TX_ADRLO",
+                  NextValue(txphy_do, opi_addr[:16]),
+                  NextValue(txphy_cnt, self.config.fields.dummy - 1),
+                  NextState("TX_DUMMY"),
+                  )
+        txphy.act("TX_DUMMY",
+                  NextValue(txphy_oe, 0),
+                  NextValue(txphy_do, 0),
+                  NextValue(txphy_cnt, txphy_cnt - 1),
+                  If(txphy_cnt == 0,
+                     NextValue(opi_rx_run, 1),
+                     If(tx_resetcycle,
+                        NextValue(txphy_clken, 1),
+                        NextValue(opi_reset_rx_req, 1),
+                        NextState("TX_RESET_RX"),
+                        ).Else(
+                         NextState("TX_FILL"),
+                     )
+                     )
+                  )
+        txphy.act("TX_FILL",
+                  If(tx_run,
+                     If(((~txphy_bus & (self.bus.cyc & self.bus.stb & ~self.bus.we & (self.bus.cti == 2))) &
+                         (opi_addr[2:] != self.bus.adr)) | tx_resetcycle,
+                        # it's a new bus cycle, and the requested address is not equal to the current read buffer address
+                        NextValue(txphy_clken, 1),
+                        NextValue(opi_reset_rx_req, 1),
+                        NextState("TX_RESET_RX"),
+                        ).Else(
+                         If(tx_almostfull & ~self.bus.ack,
+                            NextValue(txphy_clken, 0)
+                            ).Else(
+                             NextValue(txphy_clken, 1)
+                         )
+                     ),
+                     If(~(self.bus.cyc & self.bus.stb),
+                        NextValue(opi_rx_run, 0),
+                        ).Else(
+                         NextValue(opi_rx_run, 1),
+                     )
+                     ).Else(
+                      NextValue(txphy_clken, 0),
+                      NextState("RESET")
+                  )
+                  )
+        txphy.act("TX_RESET_RX",  # keep clocking the RX until it acknowledges a reset
+                  NextValue(opi_rx_run, 0),
+                  NextValue(opi_reset_rx_req, 0),
+                  If(opi_reset_rx_ack,
+                     NextValue(txphy_clken, 0),
+                     NextValue(txphy_cnt, 0),  # 1 cycle CS on back-to-back reads
+                     NextValue(txphy_cs_n, 1),
+                     NextState("TX_SETUP"),
+                     ).Else(
+                      NextValue(txphy_clken, 1),
+                  )
+                  )
+
+        # ---------  OPI CMD machine ------------------------------
+        self.submodules.opicmd = opicmd = FSM(reset_state="RESET")
+        opicmd.act("RESET",
+                   NextValue(txcmd_do, 0),
+                   NextValue(txcmd_oe, 0),
+                   NextValue(tx_run, 0),
+                   NextValue(txcmd_cs_n, 1),
+                   If(~self.spi_mode,
+                      NextState("IDLE"),
+                      ).Else(NextState("RESET_CYCLE")),
+                   )
+        opicmd.act("RESET_CYCLE",
+                   NextValue(txcmd_cs_n, 0),
+                   If(opi_reset_rx_ack,
+                      NextValue(tx_run, 1),
+                      NextState("IDLE"),
+                      ).Else(
+                       NextValue(tx_run, 1),
+                       tx_resetcycle.eq(1),
+                   )
+                   )
+        opicmd.act("IDLE",
+                   NextValue(txcmd_cs_n, 1),
+                   If(~self.spi_mode,  # this machine stays in idle once spi_mode is dropped
+                      ## The full form of this machine is as follows:
+                      # - First check if there is a CSR special command pending
+                      #   - if so, wait until the current bus cycle is done, then de-assert tx_run
+                      #   - then run the command
+                      # - Else wait until a bus cycle, and once it happens, put the system into run mode
+                      If(self.bus.cyc & self.bus.stb,
+                         If(~self.bus.we & (self.bus.cti == 2),
+                            NextState("TX_RUN")
+                            ).Else(
+                             # handle other cases here, e.g. what do we do if we get a write? probably
+                             # should just ACK it without doing anything so the CPU doesn't freeze...
+                         )
+                         ).Elif(self.command.re,
+                                NextState("DISPATCH_CMD"),
+                                )
+                      )
+                   )
+        opicmd.act("TX_RUN",
+                   NextValue(tx_run, 1),
+                   If(self.command.re,  # respond to commands
+                      NextState("WAIT_DISPATCH")
+                      )
+                   )
+        opicmd.act("WAIT_DISPATCH",  # wait until the current cycle is done, then stop TX and dispatch command
+                   If(~(self.bus.cyc & self.bus.stb),
+                      NextValue(tx_run, 0),
+                      NextState("DISPATCH_CMD")
+                      )
+                   )
+        opicmd.act("DISPATCH_CMD",
+                   If(self.command.fields.sector_erase,
+                      NextState("DO_SECTOR_ERASE"),
+                      ).Else(
+                       NextState("IDLE"),
+                   )
+                   )
+        opicmd.act("DO_SECTOR_ERASE",
+                   # placeholder
+                   )
+
+        # MAC/PHY abstraction for the SPI machine
+        spi_req = Signal()
+        spi_ack = Signal()
+        spi_do = Signal(8)  # this is the API to the machine
+        spi_di = Signal(8)
+
+        # PHY machine: SPI -------------------------------------------------------------------------
+
+        # internal signals are:
+        # selection - self.spi_mode
+        # OPI - self.do(16), self.di(16), self.tx
+        # SPI - self.mosi, self.miso
+        # cs_n - both
+        # ecs_n - OPI
+        # clk_en - both
+
+        spicount = Signal(5)
+        spi_so = Signal(8)  # this internal to the machine
+        spi_si = Signal(8)
+        spi_dummy = Signal()
+        spi_di_load = Signal()  # spi_do load is pipelined back one cycle using this mechanism
+        spi_di_load2 = Signal()
+        spi_ack_pipe = Signal()
+        self.sync += [
+            # pipelining is required the MISO path is very slow (IOB->fabric FD), and a falling-edge retiming reg is used to meet timing
+            spi_di_load2.eq(spi_di_load),
+            If(spi_di_load2, spi_di.eq(Cat(self.miso, spi_si[:-1]))).Else(spi_di.eq(spi_di)),
+            spi_ack.eq(spi_ack_pipe),
+        ]
+        self.comb += self.mosi.eq(spi_so[7])
+        self.sync += spi_si.eq(Cat(self.miso, spi_si[:-1]))
+        self.submodules.spiphy = spiphy = FSM(reset_state="RESET")
+        spiphy.act("RESET",
+                   If(spi_req,
+                      NextState("REQ"),
+                      NextValue(spicount, 7),
+                      NextValue(spi_clk_en, 1),
+                      NextValue(spi_so, spi_do),
+                      NextValue(spi_dummy, has_dummy),
+                      ).Else(
+                       NextValue(spi_clk_en, 0),
+                       NextValue(spi_ack_pipe, 0),
+                       NextValue(spicount, 0),
+                       NextValue(spi_dummy, 0),
+                   )
+                   )
+        spiphy.act("REQ",
+                   If(spicount > 0,
+                      NextValue(spicount, spicount - 1),
+                      NextValue(spi_clk_en, 1),
+                      NextValue(spi_so, Cat(0, spi_so[:-1])),
+                      NextValue(spi_ack_pipe, 0),
+                      ).Elif((spicount == 0) & spi_req & ~spi_dummy,  # back-to-back transaction
+                             NextValue(spi_clk_en, 1),
+                             NextValue(spicount, 7),
+                             NextValue(spi_clk_en, 1),
+                             NextValue(spi_so, spi_do),  # reload the so register
+                             spi_di_load.eq(1),  # "naked" .eq() create single-cycle pulses that default back to 0
+                             NextValue(spi_ack_pipe, 1),
+                             NextValue(spi_dummy, has_dummy),
+                             ).Elif((spicount == 0) & ~spi_req & ~spi_dummy,  # go back to idle
+                                    spi_di_load.eq(1),
+                                    NextValue(spi_ack_pipe, 1),
+                                    NextValue(spi_clk_en, 0),
+                                    NextState("RESET"),
+                                    ).Elif((spicount == 0) & spi_dummy,
+                                           spi_di_load.eq(1),
+                                           NextValue(spicount, self.config.fields.dummy),
+                                           NextValue(spi_clk_en, 1),
+                                           NextValue(spi_ack_pipe, 0),
+                                           NextValue(spi_so, 0),  # do a dummy with '0' as the output
+                                           NextState("DUMMY"),
+                                           )  # this actually should be a fully defined situation, no "Else" applicable
+                   )
+        spiphy.act("DUMMY",
+                   If(spicount > 1,  # instead of doing dummy-1, we stop at count == 1
+                      NextValue(spicount, spicount - 1),
+                      NextValue(spi_clk_en, 1),
+                      ).Elif(spicount <= 1 & spi_req,
+                             NextValue(spi_clk_en, 1),
+                             NextValue(spicount, 7),
+                             NextValue(spi_so, spi_do),  # reload the so register
+                             NextValue(spi_ack_pipe, 1),  # finally ack the cycle
+                             NextValue(spi_dummy, has_dummy),
+                             ).Else(
+                       NextValue(spi_clk_en, 0),
+                       NextValue(spi_ack_pipe, 1),  # finally ack the cycle
+                       NextState("RESET")
+                   )
+                   )
+
+        # SPI MAC machine -------------------------------------------------------------------------------
+        # default active on boot
+        addr_updated = Signal()
+        d_to_wb = Signal(32)  # data going back to wishbone
+        mac_count = Signal(5)
+        new_cycle = Signal(1)
+        self.submodules.mac = mac = FSM(reset_state="RESET")
+        mac.act("RESET",
+                NextValue(self.spi_mode, 1),
+                NextValue(addr_updated, 0),
+                NextValue(d_to_wb, 0),
+                NextValue(spi_cs_n, 1),
+                NextValue(has_dummy, 0),
+                NextValue(spi_do, 0),
+                NextValue(spi_req, 0),
+                NextValue(mac_count, 0),
+                NextState("WAKEUP_PRE"),
+                NextValue(new_cycle, 1),
+                If(self.spi_mode, NextValue(self.bus.ack, 0)),
+                )
+        if spiread:
+            mac.act("IDLE",
+                    If(self.spi_mode,  # this machine stays in idle once spi_mode is dropped
+                       NextValue(self.bus.ack, 0),
+                       If((self.bus.cyc == 1) & (self.bus.stb == 1) & (self.bus.we == 0) & (self.bus.cti != 7),
+                          # read cycle requested, not end-of-burst
+                          If((rom_addr[2:] != self.bus.adr) & new_cycle,
+                             NextValue(rom_addr, Cat(Signal(2, reset=0), self.bus.adr)),
+                             NextValue(addr_updated, 1),
+                             NextValue(spi_cs_n, 1),  # raise CS in anticipation of a new address cycle
+                             NextState("SPI_READ_32_CS"),
+                             ).Elif((rom_addr[2:] == self.bus.adr) | (~new_cycle & self.bus.cti == 2),
+                                    NextValue(mac_count, 3),  # get another beat of 4 bytes at the next address
+                                    NextState("SPI_READ_32")
+                                    ).Else(
+                              NextValue(addr_updated, 0),
+                              NextValue(spi_cs_n, 0),
+                              NextState("SPI_READ_32"),
+                              NextValue(mac_count, 3),  # prep the MAC state counter to count out 4 bytes
+                          ),
+                          ).Elif(self.command.fields.wakeup,
+                                 NextValue(spi_cs_n, 1),
+                                 NextValue(self.command.storage, 0),  # clear all pending commands
+                                 NextState("WAKEUP_PRE"),
+                                 )
+                       )
+                    )
+        else:
+            mac.act("IDLE",
+                    If(self.spi_mode,  # this machine stays in idle once spi_mode is dropped
+                       If(self.command.fields.wakeup,
+                          NextValue(spi_cs_n, 1),
+                          NextValue(self.command.storage, 0),  # clear all pending commands
+                          NextState("WAKEUP_PRE"),
+                          )
+                       )
+                    )
+
+        # ---------  wakup chip ------------------------------
+        mac.act("WAKEUP_PRE",
+                NextValue(spi_cs_n, 1),  # why isn't this sticking? i shouldn't have to put this here
+                NextValue(mac_count, 4),
+                NextState("WAKEUP_PRE_CS_WAIT")
+                )
+        mac.act("WAKEUP_PRE_CS_WAIT",
+                NextValue(mac_count, mac_count - 1),
+                If(mac_count == 0,
+                   NextState("WAKEUP_WUP"),
+                   NextValue(spi_cs_n, 0),
+                   )
+                )
+        mac.act("WAKEUP_WUP",
+                NextValue(mac_count, mac_count - 1),
+                If(mac_count == 0,
+                   NextValue(spi_cs_n, 0),
+                   NextValue(spi_do, 0xab),  # wakeup from deep sleep
+                   NextValue(spi_req, 1),
+                   NextState("WAKEUP_WUP_WAIT"),
+                   )
+                )
+        mac.act("WAKEUP_WUP_WAIT",
+                NextValue(spi_req, 0),
+                If(spi_ack,
+                   NextValue(spi_cs_n, 1),  # raise CS
+                   NextValue(mac_count, 4),  # for >4 cycles per specsheet
+                   NextState("WAKEUP_CR2_WREN_1")
+                   )
+                )
+
+        # ---------  WREN+CR2 - dummy cycles ------------------------------
+        mac.act("WAKEUP_CR2_WREN_1",
+                NextValue(mac_count, mac_count - 1),
+                If(mac_count == 0,
+                   NextValue(spi_cs_n, 0),
+                   NextValue(spi_do, 0x06),  # WREN to unlock CR2 writing
+                   NextValue(spi_req, 1),
+                   NextState("WAKEUP_CR2_WREN_1_WAIT"),
+                   )
+                )
+        mac.act("WAKEUP_CR2_WREN_1_WAIT",
+                NextValue(spi_req, 0),
+                If(spi_ack,
+                   NextValue(spi_cs_n, 1),
+                   NextValue(mac_count, 4),
+                   NextState("WAKEUP_CR2_DUMMY_CMD"),
+                   )
+                )
+        mac.act("WAKEUP_CR2_DUMMY_CMD",
+                NextValue(mac_count, mac_count - 1),
+                If(mac_count == 0,
+                   NextValue(spi_cs_n, 0),
+                   NextValue(spi_do, 0x72),  # CR2 command
+                   NextValue(spi_req, 1),
+                   NextValue(mac_count, 2),
+                   NextState("WAKEUP_CR2_DUMMY_ADRHI"),
+                   )
+                )
+        mac.act("WAKEUP_CR2_DUMMY_ADRHI",
+                NextValue(spi_do, 0x00),  # we want to send 00_00_03_00
+                If(spi_ack,
+                   NextValue(mac_count, mac_count - 1),
+                   ),
+                If(mac_count == 0,
+                   NextState("WAKEUP_CR2_DUMMY_ADRMID")
+                   )
+                )
+        mac.act("WAKEUP_CR2_DUMMY_ADRMID",
+                NextValue(spi_do, 0x03),
+                If(spi_ack, NextState("WAKEUP_CR2_DUMMY_ADRLO")),
+                )
+        mac.act("WAKEUP_CR2_DUMMY_ADRLO",
+                NextValue(spi_do, 0x00),
+                If(spi_ack, NextState("WAKEUP_CR2_DUMMY_DATA")),
+                )
+        mac.act("WAKEUP_CR2_DUMMY_DATA",
+                NextValue(spi_do, 0x05),  # 10 dummy cycles as required for 84MHz-104MHz operation
+                If(spi_ack, NextState("WAKEUP_CR2_DUMMY_WAIT")),
+                )
+        mac.act("WAKEUP_CR2_DUMMY_WAIT",
+                NextValue(spi_req, 0),
+                If(spi_ack,
+                   NextValue(spi_cs_n, 1),
+                   NextValue(mac_count, 4),
+                   NextState("WAKEUP_CR2_WREN_2")
+                   )
+                )
+
+        # ---------  WREN+CR2 to DOPI mode ------------------------------
+        mac.act("WAKEUP_CR2_WREN_2",
+                NextValue(mac_count, mac_count - 1),
+                If(mac_count == 0,
+                   NextValue(spi_cs_n, 0),
+                   NextValue(spi_do, 0x06),  # WREN to unlock CR2 writing
+                   NextValue(spi_req, 1),
+                   NextState("WAKEUP_CR2_WREN_2_WAIT"),
+                   )
+                )
+        mac.act("WAKEUP_CR2_WREN_2_WAIT",
+                NextValue(spi_req, 0),
+                If(spi_ack,
+                   NextValue(spi_cs_n, 1),
+                   NextValue(mac_count, 4),
+                   NextState("WAKEUP_CR2_DOPI_CMD"),
+                   )
+                )
+        mac.act("WAKEUP_CR2_DOPI_CMD",
+                NextValue(mac_count, mac_count - 1),
+                If(mac_count == 0,
+                   NextValue(spi_cs_n, 0),
+                   NextValue(spi_do, 0x72),  # CR2 command
+                   NextValue(spi_req, 1),
+                   NextValue(mac_count, 4),
+                   NextState("WAKEUP_CR2_DOPI_ADR"),
+                   )
+                )
+        mac.act("WAKEUP_CR2_DOPI_ADR",  # send 0x00_00_00_00 as address
+                NextValue(spi_do, 0x00),  # no need to raise CS or lower spi_req, this is back-to-back
+                If(spi_ack,
+                   NextValue(mac_count, mac_count - 1),
+                   ),
+                If(mac_count == 0,
+                   NextState("WAKEUP_CR2_DOPI_DATA"),
+                   )
+                ),
+        mac.act("WAKEUP_CR2_DOPI_DATA",
+                NextValue(spi_do, 2),  # enable DOPI mode
+                If(spi_ack, NextState("WAKEUP_CR2_DOPI_WAIT")),
+                )
+        mac.act("WAKEUP_CR2_DOPI_WAIT",  # trailing CS wait
+                NextValue(spi_req, 0),
+                If(spi_ack,
+                   NextValue(spi_cs_n, 1),
+                   NextValue(mac_count, 4),
+                   NextState("WAKEUP_CS_EXIT")
+                   )
+                )
+        mac.act("WAKEUP_CS_EXIT",
+                NextValue(self.spi_mode, 0),  # now enter DOPI mode
+                NextValue(mac_count, mac_count - 1),
+                If(mac_count == 0,
+                   NextState("IDLE"),
+                   )
+                )
+
+        if spiread:
+            # ---------  SPI read machine ------------------------------
+            mac.act("SPI_READ_32",
+                    If(addr_updated,
+                       NextState("SPI_READ_32_CS"),
+                       NextValue(has_dummy, 0),
+                       NextValue(mac_count, 3),
+                       NextValue(spi_cs_n, 1),
+                       NextValue(spi_req, 0),
+                       ).Else(
+                        If(mac_count > 0,
+                           NextValue(has_dummy, 0),
+                           NextValue(spi_req, 1),
+                           NextState("SPI_READ_32_D")
+                           ).Else(
+                            NextValue(spi_req, 0),
+                            If(spi_ack,
+                               If(self.spi_mode,
+                                  # protect these in a spi_mode mux to prevent excess inference of logic to handle otherwise implicit dual-master situation
+                                  NextValue(self.bus.dat_r, Cat(d_to_wb[8:], spi_di)),
+                                  NextValue(self.bus.ack, 1),
+                                  ),
+                               NextValue(rom_addr, rom_addr + 1),
+                               NextState("IDLE")
+                               )
+                        )
+                    )
+                    )
+            mac.act("SPI_READ_32_D",
+                    If(spi_ack,
+                       # shift in one byte at a time to d_to_wb(32)
+                       NextValue(d_to_wb, Cat(d_to_wb[8:], spi_di, )),
+                       NextValue(mac_count, mac_count - 1),
+                       NextState("SPI_READ_32"),
+                       NextValue(rom_addr, rom_addr + 1),
+                       )
+                    )
+            mac.act("SPI_READ_32_CS",
+                    NextValue(mac_count, mac_count - 1),
+                    If(mac_count == 0,
+                       NextValue(spi_cs_n, 0),
+                       NextState("SPI_READ_32_A0"),
+                       )
+                    )
+            mac.act("SPI_READ_32_A0",
+                    NextValue(spi_do, 0x0c),  # 32-bit address write for "fast read" command
+                    NextValue(spi_req, 1),
+                    NextState("SPI_READ_32_A1"),
+                    )
+            mac.act("SPI_READ_32_A1",
+                    NextValue(spi_do, rom_addr[24:] & 0x7),
+                    # queue up MSB to send, leave req high; mask off unused high bits
+                    If(spi_ack,
+                       NextState("SPI_READ_32_A2"),
+                       )
+                    )
+            mac.act("SPI_READ_32_A2",
+                    NextValue(spi_do, rom_addr[16:24]),
+                    If(spi_ack,
+                       NextState("SPI_READ_32_A3"),
+                       )
+                    )
+            mac.act("SPI_READ_32_A3",
+                    NextValue(spi_do, rom_addr[8:16]),
+                    If(spi_ack,
+                       NextState("SPI_READ_32_A4"),
+                       )
+                    )
+            mac.act("SPI_READ_32_A4",
+                    NextValue(spi_do, rom_addr[:8]),
+                    If(spi_ack,
+                       NextState("SPI_READ_32_A5"),
+                       )
+                    )
+            mac.act("SPI_READ_32_A5",
+                    NextValue(spi_do, 0),
+                    If(spi_ack,
+                       NextState("SPI_READ_32_DUMMY")
+                       )
+                    )
+            mac.act("SPI_READ_32_DUMMY",
+                    NextValue(spi_req, 0),
+                    NextValue(addr_updated, 0),
+                    If(spi_ack,
+                       NextState("SPI_READ_32"),
+                       NextValue(mac_count, 3),  # prep the MAC state counter to count out 4 bytes
+                       ).Else(
+                        NextState("SPI_READ_32_DUMMY")
+                    )
+                    )
+
+        # Handle ECS_n -----------------------------------------------------------------------------
+        # treat ECS_N as an async signal -- just a "rough guide" of problems
+        ecs_n = Signal()
+        self.specials += MultiReg(pads.ecs_n, ecs_n)
+
+        self.submodules.ev = EventManager()
+        self.ev.ecc_error = EventSourceProcess()  # Falling edge triggered
+        self.ev.finalize()
+        self.comb += self.ev.ecc_error.trigger.eq(ecs_n)
+        ecc_reported = Signal()
+        ecs_n_delay = Signal()
+        ecs_pulse = Signal()
+
+        self.ecc_address = CSRStatus(fields=[
+            CSRField("ecc_address", size=32, description="Address of the most recent ECC event")
+        ])
+        self.ecc_status = CSRStatus(fields=[
+            CSRField("ecc_error", size=1,
+                     description="Live status of the ECS_N bit (ECC error on current packet when low)"),
+            CSRField("ecc_overflow", size=1,
+                     description="More than one ECS_N event has happened since th last time ecc_address was checked")
+        ])
+
+        self.comb += self.ecc_status.fields.ecc_error.eq(ecs_n)
+        self.comb += [
+            ecs_pulse.eq(ecs_n_delay & ~ecs_n),  # falling edge -> positive pulse
+            If(ecs_pulse,
+               self.ecc_address.fields.ecc_address.eq(rom_addr),
+               If(ecc_reported,
+                  self.ecc_status.fields.ecc_overflow.eq(1)
+                  ).Else(
+                   self.ecc_status.fields.ecc_overflow.eq(self.ecc_status.fields.ecc_overflow),
+               )
+               ).Else(
+                self.ecc_address.fields.ecc_address.eq(self.ecc_address.fields.ecc_address),
+                If(self.ecc_status.we,
+                   self.ecc_status.fields.ecc_overflow.eq(0),
+                   ).Else(
+                    self.ecc_status.fields.ecc_overflow.eq(self.ecc_status.fields.ecc_overflow),
+                )
+            )
+        ]
+        self.sync += [
+            ecs_n_delay.eq(ecs_n),
+            If(ecs_pulse,
+               ecc_reported.eq(1)
+               ).Elif(self.ecc_address.we,
+                      ecc_reported.eq(0)
+                      )
+        ]

From 33d9e45a8bf2f34381e748fad81a9916b18d1b15 Mon Sep 17 00:00:00 2001
From: bunnie <bunnie@kosagi.com>
Date: Thu, 6 Feb 2020 17:54:26 +0800
Subject: [PATCH 2/5] fix formatting on spiopi

Pycharm really butchered the code when I did a copy-and-paste...
it has questionable default formatting preferences.
---
 litex/soc/cores/spiopi.py | 663 +++++++++++++++++++-------------------
 1 file changed, 323 insertions(+), 340 deletions(-)

diff --git a/litex/soc/cores/spiopi.py b/litex/soc/cores/spiopi.py
index 941d65c62..12a05d71d 100644
--- a/litex/soc/cores/spiopi.py
+++ b/litex/soc/cores/spiopi.py
@@ -1,6 +1,3 @@
-# This file is Copyright (c) 2020 bunnie <bunnie@kosagi.com>
-# License: BSD
-
 from litex.soc.interconnect.csr_eventmanager import *
 from litex.soc.interconnect import wishbone
 from litex.soc.integration.doc import AutoDoc, ModuleDoc
@@ -12,13 +9,13 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         self.intro = ModuleDoc("""
         Intro
         ********
-
+        
         SpiOpi implements a dual-mode SPI or OPI interface. OPI is an octal (8-bit) wide
         variant of SPI, which is unique to Macronix parts. It is concurrently interoperable
         with SPI. The chip supports "DTR mode" (double transfer rate, e.g. DDR) where data
         is transferred on each edge of the clock, and there is a source-synchronous DQS
         associated with the input data.
-
+        
         The chip by default boots into SPI-only mode (unless NV bits are burned otherwise)
         so to enable OPI, a config register needs to be written with SPI mode. Note that once
         the config register is written, the only way to return to SPI mode is to change
@@ -27,29 +24,29 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         and so the SPI ROM will be in DOPI, and SPI loading will fail. Thus, system architects
         must take into consideration a hard reset for the ROM whenever a bitstream reload
         is demanded of the FPGA. 
-
+        
         The SpiOpi architecture is split into two levels: a command manager, and a
         cycle manager. The command manager is responsible for taking the current wishbone
         request and CSR state and unpacking these into cycle-by-cycle requests. The cycle
         manager is responsible for coordinating the cycle-by-cycle requests. 
-
+        
         In SPI mode, this means marshalling byte-wide requests into a series of 8 serial cyles.
-
+        
         In OPI [DOPI] mode, this means marshalling 16-bit wide requests into a pair of back-to-back DDR
         cycles. Note that because the cycles are DDR, this means one 16-bit wide request must be
         issued every cycle to keep up with the interface. 
-
+        
         For the output of data to ROM, expects a clock called "spinor_delayed" which is a delayed 
         version of "sys". The delay is necessary to get the correct phase relationship between 
         the SIO and SCLK in DTR/DDR mode, and it also has to compensate for the special-case
         difference in the CCLK pad vs other I/O.
-
+        
         For the input, DQS signal is independently delayed relative to the DQ signals using
         an IDELAYE2 block. At a REFCLK frequency of 200 MHz, each delay tap adds 78ps, so up
         to a 2.418ns delay is possible between DQS and DQ. The goal is to delay DQS relative
         to DQ, because the SPI chip launches both with concurrent rising edges (to within 0.6ns),
         but the IDDR register needs the rising edge of DQS to be centered inside the DQ eye.
-
+        
         In DOPI mode, there is a prefetch buffer. It will read `prefetch_lines` cache lines of 
         data into the prefetch buffer. A cache line is 256 bits (or 8x32-bit words). The maximum 
         value is 63 lines (one line is necessary for synchronization margin). The downside of
@@ -60,26 +57,26 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         1-3 lines read-ahead of the CPU. Any higher than 3 lines probably just wastes power.
         In short simulations, 1 line of prefetch seems to be enough to keep the prefetcher
         ahead of the CPU even when it's simply running straight-line code.
-
+        
         Note the "sim" parameter exists because there seems to be a bug in xvlog that doesn't
         correctly simulate the IDELAY machines. Setting "sim" to True removes the IDELAY machines
         and passes the data through directly, but in real hardware the IDELAY machines are
         necessary to meet timing between DQS and DQ.  
-
+        
         dq_delay_taps probably doesn't need to be adjusted; it can be tweaked for timing
         closure. The delays can also be adjusted at runtime.
         """)
         if prefetch_lines > 63:
             prefetch_lines = 63
 
-        self.spi_mode = Signal(reset=1)  # when reset is asserted, force into spi mode
-        cs_n = Signal(reset=1)  # make sure CS is sane on reset, too
+        self.spi_mode = Signal(reset=1) # when reset is asserted, force into spi mode
+        cs_n = Signal(reset=1) # make sure CS is sane on reset, too
 
         self.config = CSRStorage(fields=[
             CSRField("dummy", size=5, description="Number of dummy cycles", reset=10),
         ])
 
-        delay_type = "VAR_LOAD"
+        delay_type="VAR_LOAD"
 
         # DQS input conditioning -----------------------------------------------------------------
         dqs_iobuf = Signal()
@@ -89,14 +86,15 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
             Instance("BUFR", i_I=pads.dqs, o_O=dqs_iobuf),
         ]
 
+
         # DQ connections -------------------------------------------------------------------------
         # PHY API
-        self.do = Signal(16)  # OPI data to SPI
-        self.di = Signal(16)  # OPI data from SPI
-        self.tx = Signal()  # when asserted OPI is transmitting data to SPI, otherwise, receiving
+        self.do = Signal(16) # OPI data to SPI
+        self.di = Signal(16) # OPI data from SPI
+        self.tx = Signal() # when asserted OPI is transmitting data to SPI, otherwise, receiving
 
-        self.mosi = Signal()  # SPI data to SPI
-        self.miso = Signal()  # SPI data from SPI
+        self.mosi = Signal() # SPI data to SPI
+        self.miso = Signal() # SPI data from SPI
 
         # Delay programming API
         self.delay_config = CSRStorage(fields=[
@@ -111,8 +109,8 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         self.sync += self.delay_update.eq(self.hw_delay_load | self.delay_config.fields.load)
 
         # Break system API into rising/falling edge samples
-        do_rise = Signal(8)  # data output presented on the rising edge
-        do_fall = Signal(8)  # data output presented on the falling edge
+        do_rise = Signal(8) # data output presented on the rising edge
+        do_fall = Signal(8) # data output presented on the falling edge
         self.comb += [do_rise.eq(self.do[8:]), do_fall.eq(self.do[:8])]
 
         di_rise = Signal(8)
@@ -120,88 +118,81 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         self.comb += self.di.eq(Cat(di_fall, di_rise))
 
         # OPI DDR registers
-        dq = TSTriple(7)  # dq[0] is special because it is also MOSI
+        dq = TSTriple(7) # dq[0] is special because it is also MOSI
         dq_delayed = Signal(8)
         self.specials += dq.get_tristate(pads.dq[1:])
         for i in range(1, 8):
             self.specials += Instance("ODDR",
-                                      p_DDR_CLK_EDGE="SAME_EDGE",
-                                      i_C=ClockSignal(), i_R=ResetSignal(), i_S=0, i_CE=1,
-                                      i_D1=do_rise[i], i_D2=do_fall[i], o_Q=dq.o[i - 1],
-                                      )
+                p_DDR_CLK_EDGE="SAME_EDGE",
+                i_C=ClockSignal(), i_R=ResetSignal(), i_S=0, i_CE=1,
+                i_D1=do_rise[i], i_D2=do_fall[i], o_Q=dq.o[i-1],
+            )
             if sim == False:
-                if i == 1:  # only wire up o_CNTVALUEOUT for one instance
+                if i == 1: # only wire up o_CNTVALUEOUT for one instance
                     self.specials += Instance("IDELAYE2",
-                                              p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
-                                              p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE",
-                                              p_REFCLK_FREQUENCY=200.0,
-                                              p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps,
-                                              p_IDELAY_TYPE=delay_type,
+                             p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                             p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
+                             p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
 
-                                              i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0,
-                                              i_CE=0,
-                                              i_LD=self.delay_update,
-                                              i_CNTVALUEIN=self.delay_config.fields.d,
-                                              o_CNTVALUEOUT=self.delay_status.fields.q,
-                                              i_IDATAIN=dq.i[i - 1], o_DATAOUT=dq_delayed[i],
-                                              ),
-                else:  # don't wire up o_CNTVALUEOUT for others
+                             i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
+                             i_LD=self.delay_update,
+                             i_CNTVALUEIN=self.delay_config.fields.d, o_CNTVALUEOUT=self.delay_status.fields.q,
+                             i_IDATAIN=dq.i[i-1], o_DATAOUT=dq_delayed[i],
+                    ),
+                else: # don't wire up o_CNTVALUEOUT for others
                     self.specials += Instance("IDELAYE2",
-                                              p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
-                                              p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE",
-                                              p_REFCLK_FREQUENCY=200.0,
-                                              p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps,
-                                              p_IDELAY_TYPE=delay_type,
+                              p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                              p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
+                              p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
 
-                                              i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0,
-                                              i_CE=0,
-                                              i_LD=self.delay_update,
-                                              i_CNTVALUEIN=self.delay_config.fields.d,
-                                              i_IDATAIN=dq.i[i - 1], o_DATAOUT=dq_delayed[i],
-                                              ),
+                              i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
+                              i_LD=self.delay_update,
+                              i_CNTVALUEIN=self.delay_config.fields.d,
+                              i_IDATAIN=dq.i[i-1], o_DATAOUT=dq_delayed[i],
+                  ),
             else:
-                self.comb += dq_delayed[i].eq(dq.i[i - 1])
+                self.comb += dq_delayed[i].eq(dq.i[i-1])
             self.specials += Instance("IDDR", name="{}{}".format(iddr_name, str(i)),
-                                      p_DDR_CLK_EDGE="SAME_EDGE_PIPELINED",
-                                      i_C=dqs_iobuf, i_R=ResetSignal(), i_S=0, i_CE=1,
-                                      i_D=dq_delayed[i], o_Q1=di_rise[i], o_Q2=di_fall[i],
-                                      )
+                p_DDR_CLK_EDGE="SAME_EDGE_PIPELINED",
+                i_C=dqs_iobuf, i_R=ResetSignal(), i_S=0, i_CE=1,
+                i_D=dq_delayed[i], o_Q1=di_rise[i], o_Q2=di_fall[i],
+            )
         # SPI SDR register
         self.specials += [
             Instance("FDRE", name="{}".format(miso_name), i_C=~ClockSignal("spinor"), i_CE=1, i_R=0, o_Q=self.miso,
                      i_D=dq_delayed[1],
-                     )
+            )
         ]
 
         # bit 0 (MOSI) is special-cased to handle SPI mode
-        dq_mosi = TSTriple(1)  # this has similar structure but an independent "oe" signal
+        dq_mosi = TSTriple(1) # this has similar structure but an independent "oe" signal
         self.specials += dq_mosi.get_tristate(pads.dq[0])
-        do_mux_rise = Signal()  # mux signal for mosi/dq select of bit 0
+        do_mux_rise = Signal() # mux signal for mosi/dq select of bit 0
         do_mux_fall = Signal()
         self.specials += [
             Instance("ODDR",
-                     p_DDR_CLK_EDGE="SAME_EDGE",
-                     i_C=ClockSignal(), i_R=ResetSignal(), i_S=0, i_CE=1,
-                     i_D1=do_mux_rise, i_D2=do_mux_fall, o_Q=dq_mosi.o,
-                     ),
+              p_DDR_CLK_EDGE="SAME_EDGE",
+              i_C=ClockSignal(), i_R=ResetSignal(), i_S=0, i_CE=1,
+              i_D1=do_mux_rise, i_D2=do_mux_fall, o_Q=dq_mosi.o,
+            ),
             Instance("IDDR",
-                     p_DDR_CLK_EDGE="SAME_EDGE_PIPELINED",
-                     i_C=dqs_iobuf, i_R=ResetSignal(), i_S=0, i_CE=1,
-                     o_Q1=di_rise[0], o_Q2=di_fall[0], i_D=dq_delayed[0],
-                     ),
+              p_DDR_CLK_EDGE="SAME_EDGE_PIPELINED",
+              i_C=dqs_iobuf, i_R=ResetSignal(), i_S=0, i_CE=1,
+              o_Q1=di_rise[0], o_Q2=di_fall[0], i_D=dq_delayed[0],
+            ),
         ]
         if sim == False:
             self.specials += [
-                Instance("IDELAYE2",
-                         p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
-                         p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
-                         p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
+            Instance("IDELAYE2",
+                     p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                     p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
+                     p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
 
-                         i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
-                         i_LD=self.delay_update,
-                         i_CNTVALUEIN=self.delay_config.fields.d,
-                         i_IDATAIN=dq_mosi.i, o_DATAOUT=dq_delayed[0],
-                         ),
+                     i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
+                     i_LD=self.delay_update,
+                     i_CNTVALUEIN=self.delay_config.fields.d,
+                     i_IDATAIN=dq_mosi.i, o_DATAOUT=dq_delayed[0],
+                     ),
             ]
         else:
             self.comb += dq_delayed[0].eq(dq_mosi.i)
@@ -214,7 +205,7 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                      i_CLK=0, i_GSR=0, i_GTS=0, i_KEYCLEARB=0, i_PACK=0, i_USRDONEO=1, i_USRDONETS=1,
                      i_USRCCLKO=0, i_USRCCLKTS=1,  # force to tristate
                      ),
-            Instance("ODDR", name=sclk_name,  # need to name this so we can constrain it properly
+            Instance("ODDR", name=sclk_name, # need to name this so we can constrain it properly
                      p_DDR_CLK_EDGE="SAME_EDGE",
                      i_C=ClockSignal("spinor"), i_R=ResetSignal("spinor"), i_S=0, i_CE=1,
                      i_D1=clk_en, i_D2=0, o_Q=pads.sclk,
@@ -224,25 +215,25 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         # wire up CS_N
         spi_cs_n = Signal()
         opi_cs_n = Signal()
-        self.comb += cs_n.eq((self.spi_mode & spi_cs_n) | (~self.spi_mode & opi_cs_n))
+        self.comb += cs_n.eq( (self.spi_mode & spi_cs_n) | (~self.spi_mode & opi_cs_n) )
         self.specials += [
             Instance("ODDR",
-                     p_DDR_CLK_EDGE="SAME_EDGE",
-                     i_C=ClockSignal(), i_R=0, i_S=ResetSignal(), i_CE=1,
-                     i_D1=cs_n, i_D2=cs_n, o_Q=pads.cs_n,
-                     ),
+              p_DDR_CLK_EDGE="SAME_EDGE",
+              i_C=ClockSignal(), i_R=0, i_S=ResetSignal(), i_CE=1,
+              i_D1=cs_n, i_D2=cs_n, o_Q=pads.cs_n,
+            ),
         ]
 
         self.architecture = ModuleDoc("""
         Architecture
         **************
-
+        
         The machine is split into two separate pieces, one to handle SPI, and one to handle OPI.
-
+        
         SPI
         =====
         The SPI machine architecture is split into two levels: MAC and PHY. 
-
+        
         The MAC layer is responsible for:
         - receiving requests via CSR register to perform config/status/special command sequences,
         and dispatching these to the SPI PHY
@@ -251,41 +242,41 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         - managing the chip select to the chip, and ensuring that one dummy cycle is inserted after
         chip select is asserted, or before it is de-asserted; and that the chip select "high" times
         are adequate (1 cycle between reads, 4 cycles for all other operations)
-
+        
         On boot, the interface runs in SPI; once the wakeup sequence is executed, the chip permanently
         switches to OPI mode unless the CR2 registers are written to fall back, or the 
         reset to the chip is asserted.
-
+          
         The PHY layers are responsible for the following tasks:
         - Serializing and deserializing data, standardized on 8 bits for SPI and 16 bits for OPI
         - counting dummy cycles
         - managing the clock enable
-
+        
         PHY cycles are initiated with a "req" signal, which is only sampled for 
         one cycle and then ignored until the PHY issues an "ack" that the current cycle is complete. 
         Thus holding "req" high can allow the PHY to back-to-back issue cycles without pause.
-
+        
         OPI
         =====
         The OPI machine is split into three parts: a command controller, a Tx PHY, and an Rx PHY. 
-
+        
         The Tx PHY is configured with a "dummy cycle" count register, as there is a variable length
         delay for dummy cycles in OPI.
-
+        
         In OPI mode, read data is `mesochronous`, that is, they return at precisely the same frequency
         as SCLK, but with an unknown phase relationship. The DQS strobe is provided as a "hint" to
         the receiving side to help retime the data. The mesochronous nature of the read data is
         why the Tx and Rx PHY must be split into two separate machines, as they are operating in
         different clock domains.
-
+        
         DQS is implemented on the ROM as an extra data output that is guaranteed to change polarity with 
         each data byte; the skew mismatch of DQS to data is within +/-0.6ns or so. It turns out the mere
         act of routing the DQS into a BUFR buffer before clocking the data into an IDDR primitive
         is sufficient to delay the DQS signal and meet setup and hold time on the IDDR. 
-
+        
         Once captured by the IDDR, the data is fed into a dual-clock FIFO to make the transition
         from the DQS to sysclk domains cleanly. 
-
+        
         Because of the latency involved in going from pin->IDDR->FIFO, excess read cycles are
         required beyond the end of the requested cache line. However, there is virtually no 
         penalty in pre-filling the FIFO with data; if a new cache line has to be fetched, 
@@ -295,24 +286,24 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         (as opposed to 49 bus cycles for random reads). Either way, this compares favorably to
         288 cycles for random reads in 100MHz SPI mode (or 576 for the spimemio.v, which runs at 
         50MHz).   
-
+        
         The command controller is repsonsible for sequencing all commands other than fast reads. Most
         commands have some special-case structure to them, and as more commands are implemented, the
         state machine is expected to grow fairly large. Fast reads are directly handled in "tx_run"
         mode, where the TxPhy and RxPhy run a tight loop to watch incoming read bus cycles, check
         the current address, fill the prefetch fifo, and respond to bus cycles. 
-
+        
         Writes to ROM might lock up the machine; a TODO is to test this and do something more sane,
         like ignore writes by sending an ACK immediately while discarding the data.
-
+        
         Thus, an OPI read proceeds as follows:
-
+        
         - When BUS/STB are asserted:
            TxPhy:
-
+           
            - capture bus_adr, and compare against the *next read* address pointer
               - if they match, allow the PHYs to do the work
-
+           
            - if bus_adr and next read address don't match, save to next read address pointer, and 
              cycle wr/rd clk for 5 cycle while asserting reset to reset the FIFO
            - initiate an 8DTRD with the read address pointer
@@ -321,13 +312,13 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
            - greedily pre-fill the FIFO by continuing to clock DQS until either:
              - the FIFO is full
              - pre-fetch is aborted because bus_adr and next read address don't match and FIFO is reset
-
+        
            RxPHY:
            - while CTI==2, assemble data into 32-bit words as soon as EMPTY is deasserted,
              present a bus_ack, and increment the next read address pointer
            - when CTI==7, ack the data, and wait until the next bus cycle with CTI==2 to resume
              reading
-
+        
         - A FIFO_SYNC_MACRO is used to instantiate the FIFO. This is chosen because:
            - we can specify RAMB18's, which seem to be under-utilized by the auto-inferred memories by migen
            - the XPM_FIFO_ASYNC macro claims no instantiation support, and also looks like it has weird
@@ -342,20 +333,19 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         """)
         self.bus = wishbone.Interface()
 
-        self.command = CSRStorage(
-            description="Write individual bits to issue special commands to SPI; setting multiple bits at once leads to undefined behavior.",
+        self.command = CSRStorage(description="Write individual bits to issue special commands to SPI; setting multiple bits at once leads to undefined behavior.",
             fields=[
                 CSRField("wakeup", size=1, description="Sequence through init & wakeup routine"),
                 CSRField("sector_erase", size=1, description="Erase a sector"),
             ])
         self.sector = CSRStorage(description="Sector to erase",
-                                 fields=[
-                                     CSRField("sector", size=32, description="Sector to erase")
-                                 ])
+            fields=[
+                CSRField("sector", size=32, description="Sector to erase")
+            ])
         self.status = CSRStatus(description="Interface status",
-                                fields=[
-                                    CSRField("wip", size=1, description="Operation in progress (write or erease)")
-                                ])
+            fields=[
+                CSRField("wip", size=1, description="Operation in progress (write or erease)")
+            ])
         # TODO: implement ECC detailed register readback, CRC checking
 
         # PHY machine mux --------------------------------------------------------------------------
@@ -372,12 +362,11 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         self.comb += do_mux_rise.eq(~self.spi_mode & do_rise[0] | self.spi_mode & self.mosi)
         self.comb += do_mux_fall.eq(~self.spi_mode & do_fall[0] | self.spi_mode & self.mosi)
 
-        has_dummy = Signal()  # indicates if the current "req" requires dummy cycles to be appended (used for both OPI/SPI)
-        rom_addr = Signal(32,
-                          reset=0xFFFFFFFC)  # location of the internal ROM address pointer; reset to invalid address to force an address request on first read
+        has_dummy = Signal() # indicates if the current "req" requires dummy cycles to be appended (used for both OPI/SPI)
+        rom_addr = Signal(32, reset=0xFFFFFFFC)  # location of the internal ROM address pointer; reset to invalid address to force an address request on first read
 
         # MAC/PHY abstraction for OPI
-        txphy_do = Signal(16)  # two sources of data out for OPI, one from the PHY, one from MAC
+        txphy_do = Signal(16) # two sources of data out for OPI, one from the PHY, one from MAC
         txcmd_do = Signal(16)
         opi_di = Signal(16)
 
@@ -409,55 +398,53 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
             # sampling 32-bit data into the FIFO.
             Instance("FDCE", name="FDCE_WREN",
                      i_C=dqs_iobuf, i_D=~wrendiv, o_Q=wrendiv, i_CE=1, i_CLR=~rx_wren,
-                     ),
+            ),
             Instance("FDCE", name="FDCE_WREN",
                      i_C=dqs_iobuf, i_D=~wrendiv2, o_Q=wrendiv2, i_CE=wrendiv & ~wrendiv2, i_CLR=~rx_wren,
-                     ),
+            ),
             # Direct FIFO primitive is more resource-efficient and faster than migen primitive.
             Instance("FIFO_DUALCLOCK_MACRO",
                      p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32, p_FIRST_WORD_FALL_THROUGH="TRUE",
-                     p_ALMOST_EMPTY_OFFSET=6, p_ALMOST_FULL_OFFSET=(512 - (8 * prefetch_lines)),
+                     p_ALMOST_EMPTY_OFFSET=6, p_ALMOST_FULL_OFFSET=(512- (8*prefetch_lines)),
 
                      o_ALMOSTEMPTY=rx_almostempty, o_ALMOSTFULL=rx_almostfull,
                      o_DO=opi_fifo_rd, o_EMPTY=rx_empty, o_FULL=rx_full,
                      o_RDCOUNT=rx_rdcount, o_RDERR=rx_rderr, o_WRCOUNT=rx_wrcount, o_WRERR=rx_wrerr,
                      i_DI=opi_fifo_wd, i_RDCLK=ClockSignal(), i_RDEN=rx_rden,
                      i_WRCLK=dqs_iobuf, i_WREN=wrendiv & wrendiv2, i_RST=rx_fifo_rst,
-                     )
+            )
         ]
         self.sync.dqs += opi_di.eq(self.di)
         self.comb += opi_fifo_wd.eq(Cat(opi_di, self.di))
 
-        # ---------  OPI Rx Phy machine ------------------------------
+        #---------  OPI Rx Phy machine ------------------------------
         self.submodules.rxphy = rxphy = FSM(reset_state="IDLE")
         cti_pipe = Signal(3)
         rxphy_cnt = Signal(3)
         rxphy.act("IDLE",
                   If(self.spi_mode,
                      NextState("IDLE"),
-                     ).Else(
+                  ).Else(
                       NextValue(self.bus.ack, 0),
                       If(opi_reset_rx_req,
                          NextState("WAIT_RESET"),
                          NextValue(rxphy_cnt, 6),
                          NextValue(rx_wren, 0),
                          NextValue(rx_fifo_rst, 1),
-                         ).Elif(opi_rx_run,
-                                NextValue(rx_wren, 1),
-                                If((self.bus.cyc & self.bus.stb & ~self.bus.we) & ((self.bus.cti == 2) |
-                                                                                   ((
-                                                                                                self.bus.cti == 7) & ~self.bus.ack)),
-                                   # handle case of non-pipelined read, ack is late
-                                   If(~rx_empty,
-                                      NextValue(self.bus.dat_r, opi_fifo_rd),
-                                      rx_rden.eq(1),
-                                      NextValue(opi_addr, opi_addr + 4),
-                                      NextValue(self.bus.ack, 1),
-                                      )
-                                   )
-                                )
-                  )
+                      ).Elif(opi_rx_run,
+                              NextValue(rx_wren, 1),
+                              If( (self.bus.cyc & self.bus.stb & ~self.bus.we) & ((self.bus.cti == 2) |
+                                 ((self.bus.cti == 7) & ~self.bus.ack) ), # handle case of non-pipelined read, ack is late
+                                 If(~rx_empty,
+                                    NextValue(self.bus.dat_r, opi_fifo_rd),
+                                    rx_rden.eq(1),
+                                    NextValue(opi_addr, opi_addr + 4),
+                                    NextValue(self.bus.ack, 1),
+                                 )
+                              )
+                      )
                   )
+        )
         rxphy.act("WAIT_RESET",
                   NextValue(opi_addr, Cat(Signal(2), self.bus.adr)),
                   NextValue(rxphy_cnt, rxphy_cnt - 1),
@@ -465,8 +452,9 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                      NextValue(rx_fifo_rst, 0),
                      opi_reset_rx_ack.eq(1),
                      NextState("IDLE"),
-                     )
                   )
+        )
+
 
         # TxPHY machine: OPI -------------------------------------------------------------------------
         txphy_cnt = Signal(4)
@@ -477,12 +465,12 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         txcmd_clken = Signal()
         txphy_oe = Signal()
         txcmd_oe = Signal()
-        self.sync += opi_cs_n.eq((tx_run & txphy_cs_n) | (~tx_run & txcmd_cs_n))
-        self.comb += If(tx_run, self.do.eq(txphy_do)).Else(self.do.eq(txcmd_do))
-        self.comb += opi_clk_en.eq((tx_run & txphy_clken) | (~tx_run & txcmd_clken))
-        self.comb += self.tx.eq((tx_run & txphy_oe) | (~tx_run & txcmd_oe))
+        self.sync += opi_cs_n.eq( (tx_run & txphy_cs_n) | (~tx_run & txcmd_cs_n) )
+        self.comb += If( tx_run, self.do.eq(txphy_do) ).Else( self.do.eq(txcmd_do) )
+        self.comb += opi_clk_en.eq( (tx_run & txphy_clken) | (~tx_run & txcmd_clken) )
+        self.comb += self.tx.eq( (tx_run & txphy_oe) | (~tx_run & txcmd_oe) )
         tx_almostfull = Signal()
-        self.sync += tx_almostfull.eq(rx_almostfull)  # sync the rx_almostfull signal into the local clock domain
+        self.sync += tx_almostfull.eq(rx_almostfull) # sync the rx_almostfull signal into the local clock domain
         txphy_bus = Signal()
         self.sync += txphy_bus.eq(self.bus.cyc & self.bus.stb & ~self.bus.we & (self.bus.cti == 2))
         tx_resetcycle = Signal()
@@ -496,35 +484,35 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                   # guarantee that the first state we go to out of reset is a four-cycle burst
                   NextValue(txphy_cnt, 4),
                   If(tx_run & ~self.spi_mode, NextState("TX_SETUP"))
-                  )
+        )
         txphy.act("TX_SETUP",
                   NextValue(opi_rx_run, 0),
                   NextValue(txphy_cnt, txphy_cnt - 1),
-                  If(txphy_cnt > 0,
-                     NextValue(txphy_cs_n, 1)
-                     ).Else(
+                  If( txphy_cnt > 0,
+                      NextValue(txphy_cs_n, 1)
+                  ).Else(
                       NextValue(txphy_cs_n, 0),
                       NextValue(txphy_oe, 1),
                       NextState("TX_CMD_CS_DELAY"),
                   )
-                  )
+        )
         txphy.act("TX_CMD_CS_DELAY",  # meet setup timing for CS-to-clock
                   NextState("TX_CMD"),
-                  )
+        )
         txphy.act("TX_CMD",
                   NextValue(txphy_do, 0xEE11),
                   NextValue(txphy_clken, 1),
                   NextState("TX_ADRHI"),
-                  )
+        )
         txphy.act("TX_ADRHI",
-                  NextValue(txphy_do, opi_addr[16:] & 0x07FF),  # mask off unused bits
+                  NextValue(txphy_do, opi_addr[16:] & 0x07FF), # mask off unused bits
                   NextState("TX_ADRLO"),
-                  )
+        )
         txphy.act("TX_ADRLO",
                   NextValue(txphy_do, opi_addr[:16]),
                   NextValue(txphy_cnt, self.config.fields.dummy - 1),
                   NextState("TX_DUMMY"),
-                  )
+        )
         txphy.act("TX_DUMMY",
                   NextValue(txphy_oe, 0),
                   NextValue(txphy_do, 0),
@@ -535,50 +523,50 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                         NextValue(txphy_clken, 1),
                         NextValue(opi_reset_rx_req, 1),
                         NextState("TX_RESET_RX"),
-                        ).Else(
-                         NextState("TX_FILL"),
-                     )
+                     ).Else(
+                        NextState("TX_FILL"),
                      )
                   )
+        )
         txphy.act("TX_FILL",
                   If(tx_run,
-                     If(((~txphy_bus & (self.bus.cyc & self.bus.stb & ~self.bus.we & (self.bus.cti == 2))) &
-                         (opi_addr[2:] != self.bus.adr)) | tx_resetcycle,
-                        # it's a new bus cycle, and the requested address is not equal to the current read buffer address
-                        NextValue(txphy_clken, 1),
-                        NextValue(opi_reset_rx_req, 1),
-                        NextState("TX_RESET_RX"),
-                        ).Else(
-                         If(tx_almostfull & ~self.bus.ack,
-                            NextValue(txphy_clken, 0)
-                            ).Else(
+                     If( ((~txphy_bus & (self.bus.cyc & self.bus.stb & ~self.bus.we & (self.bus.cti == 2))) &
+                          (opi_addr[2:] != self.bus.adr)) | tx_resetcycle,
+                         # it's a new bus cycle, and the requested address is not equal to the current read buffer address
+                         NextValue(txphy_clken, 1),
+                         NextValue(opi_reset_rx_req, 1),
+                         NextState("TX_RESET_RX"),
+                     ).Else(
+                          If(tx_almostfull & ~self.bus.ack,
+                             NextValue(txphy_clken, 0)
+                          ).Else(
                              NextValue(txphy_clken, 1)
-                         )
+                          )
                      ),
                      If(~(self.bus.cyc & self.bus.stb),
                         NextValue(opi_rx_run, 0),
-                        ).Else(
+                     ).Else(
                          NextValue(opi_rx_run, 1),
                      )
-                     ).Else(
+                  ).Else(
                       NextValue(txphy_clken, 0),
                       NextState("RESET")
                   )
-                  )
-        txphy.act("TX_RESET_RX",  # keep clocking the RX until it acknowledges a reset
+        )
+        txphy.act("TX_RESET_RX", # keep clocking the RX until it acknowledges a reset
                   NextValue(opi_rx_run, 0),
                   NextValue(opi_reset_rx_req, 0),
                   If(opi_reset_rx_ack,
                      NextValue(txphy_clken, 0),
-                     NextValue(txphy_cnt, 0),  # 1 cycle CS on back-to-back reads
+                     NextValue(txphy_cnt, 0), # 1 cycle CS on back-to-back reads
                      NextValue(txphy_cs_n, 1),
                      NextState("TX_SETUP"),
-                     ).Else(
+                  ).Else(
                       NextValue(txphy_clken, 1),
                   )
-                  )
+        )
 
-        # ---------  OPI CMD machine ------------------------------
+        #---------  OPI CMD machine ------------------------------
         self.submodules.opicmd = opicmd = FSM(reset_state="RESET")
         opicmd.act("RESET",
                    NextValue(txcmd_do, 0),
@@ -587,18 +575,18 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                    NextValue(txcmd_cs_n, 1),
                    If(~self.spi_mode,
                       NextState("IDLE"),
-                      ).Else(NextState("RESET_CYCLE")),
-                   )
+                   ).Else(NextState("RESET_CYCLE")),
+        )
         opicmd.act("RESET_CYCLE",
                    NextValue(txcmd_cs_n, 0),
                    If(opi_reset_rx_ack,
                       NextValue(tx_run, 1),
                       NextState("IDLE"),
-                      ).Else(
+                   ).Else(
                        NextValue(tx_run, 1),
                        tx_resetcycle.eq(1),
                    )
-                   )
+        )
         opicmd.act("IDLE",
                    NextValue(txcmd_cs_n, 1),
                    If(~self.spi_mode,  # this machine stays in idle once spi_mode is dropped
@@ -608,44 +596,44 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                       #   - then run the command
                       # - Else wait until a bus cycle, and once it happens, put the system into run mode
                       If(self.bus.cyc & self.bus.stb,
-                         If(~self.bus.we & (self.bus.cti == 2),
+                         If(~self.bus.we & (self.bus.cti ==2),
                             NextState("TX_RUN")
-                            ).Else(
+                         ).Else(
                              # handle other cases here, e.g. what do we do if we get a write? probably
                              # should just ACK it without doing anything so the CPU doesn't freeze...
                          )
-                         ).Elif(self.command.re,
-                                NextState("DISPATCH_CMD"),
-                                )
+                      ).Elif(self.command.re,
+                             NextState("DISPATCH_CMD"),
                       )
                    )
+        )
         opicmd.act("TX_RUN",
                    NextValue(tx_run, 1),
-                   If(self.command.re,  # respond to commands
+                   If(self.command.re, # respond to commands
                       NextState("WAIT_DISPATCH")
-                      )
                    )
-        opicmd.act("WAIT_DISPATCH",  # wait until the current cycle is done, then stop TX and dispatch command
-                   If(~(self.bus.cyc & self.bus.stb),
+        )
+        opicmd.act("WAIT_DISPATCH", # wait until the current cycle is done, then stop TX and dispatch command
+                   If( ~(self.bus.cyc & self.bus.stb),
                       NextValue(tx_run, 0),
                       NextState("DISPATCH_CMD")
-                      )
                    )
+        )
         opicmd.act("DISPATCH_CMD",
                    If(self.command.fields.sector_erase,
                       NextState("DO_SECTOR_ERASE"),
-                      ).Else(
+                   ).Else(
                        NextState("IDLE"),
                    )
-                   )
+        )
         opicmd.act("DO_SECTOR_ERASE",
                    # placeholder
-                   )
+        )
 
         # MAC/PHY abstraction for the SPI machine
         spi_req = Signal()
         spi_ack = Signal()
-        spi_do = Signal(8)  # this is the API to the machine
+        spi_do = Signal(8) # this is the API to the machine
         spi_di = Signal(8)
 
         # PHY machine: SPI -------------------------------------------------------------------------
@@ -659,14 +647,13 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         # clk_en - both
 
         spicount = Signal(5)
-        spi_so = Signal(8)  # this internal to the machine
+        spi_so = Signal(8) # this internal to the machine
         spi_si = Signal(8)
         spi_dummy = Signal()
-        spi_di_load = Signal()  # spi_do load is pipelined back one cycle using this mechanism
+        spi_di_load = Signal() # spi_do load is pipelined back one cycle using this mechanism
         spi_di_load2 = Signal()
         spi_ack_pipe = Signal()
-        self.sync += [
-            # pipelining is required the MISO path is very slow (IOB->fabric FD), and a falling-edge retiming reg is used to meet timing
+        self.sync += [ # pipelining is required the MISO path is very slow (IOB->fabric FD), and a falling-edge retiming reg is used to meet timing
             spi_di_load2.eq(spi_di_load),
             If(spi_di_load2, spi_di.eq(Cat(self.miso, spi_si[:-1]))).Else(spi_di.eq(spi_di)),
             spi_ack.eq(spi_ack_pipe),
@@ -681,62 +668,63 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                       NextValue(spi_clk_en, 1),
                       NextValue(spi_so, spi_do),
                       NextValue(spi_dummy, has_dummy),
-                      ).Else(
+                   ).Else(
                        NextValue(spi_clk_en, 0),
                        NextValue(spi_ack_pipe, 0),
                        NextValue(spicount, 0),
                        NextValue(spi_dummy, 0),
                    )
-                   )
+        )
         spiphy.act("REQ",
                    If(spicount > 0,
-                      NextValue(spicount, spicount - 1),
+                      NextValue(spicount, spicount-1),
                       NextValue(spi_clk_en, 1),
                       NextValue(spi_so, Cat(0, spi_so[:-1])),
                       NextValue(spi_ack_pipe, 0),
-                      ).Elif((spicount == 0) & spi_req & ~spi_dummy,  # back-to-back transaction
-                             NextValue(spi_clk_en, 1),
-                             NextValue(spicount, 7),
-                             NextValue(spi_clk_en, 1),
-                             NextValue(spi_so, spi_do),  # reload the so register
-                             spi_di_load.eq(1),  # "naked" .eq() create single-cycle pulses that default back to 0
-                             NextValue(spi_ack_pipe, 1),
-                             NextValue(spi_dummy, has_dummy),
-                             ).Elif((spicount == 0) & ~spi_req & ~spi_dummy,  # go back to idle
-                                    spi_di_load.eq(1),
-                                    NextValue(spi_ack_pipe, 1),
-                                    NextValue(spi_clk_en, 0),
-                                    NextState("RESET"),
-                                    ).Elif((spicount == 0) & spi_dummy,
-                                           spi_di_load.eq(1),
-                                           NextValue(spicount, self.config.fields.dummy),
-                                           NextValue(spi_clk_en, 1),
-                                           NextValue(spi_ack_pipe, 0),
-                                           NextValue(spi_so, 0),  # do a dummy with '0' as the output
-                                           NextState("DUMMY"),
-                                           )  # this actually should be a fully defined situation, no "Else" applicable
-                   )
+                   ).Elif( (spicount == 0) & spi_req & ~spi_dummy, # back-to-back transaction
+                          NextValue(spi_clk_en, 1),
+                          NextValue(spicount, 7),
+                          NextValue(spi_clk_en, 1),
+                          NextValue(spi_so, spi_do), # reload the so register
+                          spi_di_load.eq(1), # "naked" .eq() create single-cycle pulses that default back to 0
+                          NextValue(spi_ack_pipe, 1),
+                          NextValue(spi_dummy, has_dummy),
+                   ).Elif( (spicount == 0) & ~spi_req & ~spi_dummy, # go back to idle
+                          spi_di_load.eq(1),
+                          NextValue(spi_ack_pipe, 1),
+                          NextValue(spi_clk_en, 0),
+                          NextState("RESET"),
+                   ).Elif( (spicount == 0) & spi_dummy,
+                          spi_di_load.eq(1),
+                          NextValue(spicount, self.config.fields.dummy),
+                          NextValue(spi_clk_en, 1),
+                          NextValue(spi_ack_pipe, 0),
+                          NextValue(spi_so, 0),  # do a dummy with '0' as the output
+                          NextState("DUMMY"),
+                   ) # this actually should be a fully defined situation, no "Else" applicable
+        )
         spiphy.act("DUMMY",
-                   If(spicount > 1,  # instead of doing dummy-1, we stop at count == 1
+                   If(spicount > 1, # instead of doing dummy-1, we stop at count == 1
                       NextValue(spicount, spicount - 1),
                       NextValue(spi_clk_en, 1),
-                      ).Elif(spicount <= 1 & spi_req,
-                             NextValue(spi_clk_en, 1),
-                             NextValue(spicount, 7),
-                             NextValue(spi_so, spi_do),  # reload the so register
-                             NextValue(spi_ack_pipe, 1),  # finally ack the cycle
-                             NextValue(spi_dummy, has_dummy),
-                             ).Else(
+                   ).Elif(spicount <= 1 & spi_req,
+                          NextValue(spi_clk_en, 1),
+                          NextValue(spicount, 7),
+                          NextValue(spi_so, spi_do),  # reload the so register
+                          NextValue(spi_ack_pipe, 1), # finally ack the cycle
+                          NextValue(spi_dummy, has_dummy),
+                   ).Else(
                        NextValue(spi_clk_en, 0),
                        NextValue(spi_ack_pipe, 1),  # finally ack the cycle
                        NextState("RESET")
                    )
-                   )
+        )
+
 
         # SPI MAC machine -------------------------------------------------------------------------------
         # default active on boot
         addr_updated = Signal()
-        d_to_wb = Signal(32)  # data going back to wishbone
+        d_to_wb = Signal(32) # data going back to wishbone
         mac_count = Signal(5)
         new_cycle = Signal(1)
         self.submodules.mac = mac = FSM(reset_state="RESET")
@@ -752,193 +740,192 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                 NextState("WAKEUP_PRE"),
                 NextValue(new_cycle, 1),
                 If(self.spi_mode, NextValue(self.bus.ack, 0)),
-                )
+        )
         if spiread:
             mac.act("IDLE",
-                    If(self.spi_mode,  # this machine stays in idle once spi_mode is dropped
-                       NextValue(self.bus.ack, 0),
-                       If((self.bus.cyc == 1) & (self.bus.stb == 1) & (self.bus.we == 0) & (self.bus.cti != 7),
-                          # read cycle requested, not end-of-burst
-                          If((rom_addr[2:] != self.bus.adr) & new_cycle,
-                             NextValue(rom_addr, Cat(Signal(2, reset=0), self.bus.adr)),
-                             NextValue(addr_updated, 1),
-                             NextValue(spi_cs_n, 1),  # raise CS in anticipation of a new address cycle
-                             NextState("SPI_READ_32_CS"),
-                             ).Elif((rom_addr[2:] == self.bus.adr) | (~new_cycle & self.bus.cti == 2),
-                                    NextValue(mac_count, 3),  # get another beat of 4 bytes at the next address
-                                    NextState("SPI_READ_32")
-                                    ).Else(
-                              NextValue(addr_updated, 0),
-                              NextValue(spi_cs_n, 0),
-                              NextState("SPI_READ_32"),
-                              NextValue(mac_count, 3),  # prep the MAC state counter to count out 4 bytes
-                          ),
-                          ).Elif(self.command.fields.wakeup,
-                                 NextValue(spi_cs_n, 1),
-                                 NextValue(self.command.storage, 0),  # clear all pending commands
-                                 NextState("WAKEUP_PRE"),
-                                 )
-                       )
+                    If(self.spi_mode, # this machine stays in idle once spi_mode is dropped
+                        NextValue(self.bus.ack, 0),
+                        If((self.bus.cyc == 1) & (self.bus.stb == 1) & (self.bus.we == 0) & (self.bus.cti != 7), # read cycle requested, not end-of-burst
+                           If( (rom_addr[2:] != self.bus.adr) & new_cycle,
+                              NextValue(rom_addr, Cat(Signal(2, reset=0), self.bus.adr)),
+                              NextValue(addr_updated, 1),
+                              NextValue(spi_cs_n, 1), # raise CS in anticipation of a new address cycle
+                              NextState("SPI_READ_32_CS"),
+                           ).Elif( (rom_addr[2:] == self.bus.adr) | (~new_cycle & self.bus.cti == 2),
+                                   NextValue(mac_count, 3),  # get another beat of 4 bytes at the next address
+                                   NextState("SPI_READ_32")
+                           ).Else(
+                               NextValue(addr_updated, 0),
+                               NextValue(spi_cs_n, 0),
+                               NextState("SPI_READ_32"),
+                               NextValue(mac_count, 3),  # prep the MAC state counter to count out 4 bytes
+                           ),
+                        ).Elif(self.command.fields.wakeup,
+                               NextValue(spi_cs_n, 1),
+                               NextValue(self.command.storage, 0),  # clear all pending commands
+                               NextState("WAKEUP_PRE"),
+                        )
                     )
+            )
         else:
             mac.act("IDLE",
-                    If(self.spi_mode,  # this machine stays in idle once spi_mode is dropped
-                       If(self.command.fields.wakeup,
-                          NextValue(spi_cs_n, 1),
-                          NextValue(self.command.storage, 0),  # clear all pending commands
-                          NextState("WAKEUP_PRE"),
-                          )
-                       )
+                    If(self.spi_mode, # this machine stays in idle once spi_mode is dropped
+                        If(self.command.fields.wakeup,
+                               NextValue(spi_cs_n, 1),
+                               NextValue(self.command.storage, 0),  # clear all pending commands
+                               NextState("WAKEUP_PRE"),
+                        )
                     )
+            )
 
-        # ---------  wakup chip ------------------------------
+        #---------  wakup chip ------------------------------
         mac.act("WAKEUP_PRE",
-                NextValue(spi_cs_n, 1),  # why isn't this sticking? i shouldn't have to put this here
+                NextValue(spi_cs_n, 1), # why isn't this sticking? i shouldn't have to put this here
                 NextValue(mac_count, 4),
                 NextState("WAKEUP_PRE_CS_WAIT")
-                )
+        )
         mac.act("WAKEUP_PRE_CS_WAIT",
-                NextValue(mac_count, mac_count - 1),
+                NextValue(mac_count, mac_count-1),
                 If(mac_count == 0,
                    NextState("WAKEUP_WUP"),
                    NextValue(spi_cs_n, 0),
-                   )
                 )
+        )
         mac.act("WAKEUP_WUP",
-                NextValue(mac_count, mac_count - 1),
+                NextValue(mac_count, mac_count-1),
                 If(mac_count == 0,
                    NextValue(spi_cs_n, 0),
                    NextValue(spi_do, 0xab),  # wakeup from deep sleep
                    NextValue(spi_req, 1),
                    NextState("WAKEUP_WUP_WAIT"),
-                   )
                 )
+        )
         mac.act("WAKEUP_WUP_WAIT",
                 NextValue(spi_req, 0),
                 If(spi_ack,
                    NextValue(spi_cs_n, 1),  # raise CS
                    NextValue(mac_count, 4),  # for >4 cycles per specsheet
                    NextState("WAKEUP_CR2_WREN_1")
-                   )
                 )
+        )
 
-        # ---------  WREN+CR2 - dummy cycles ------------------------------
+        #---------  WREN+CR2 - dummy cycles ------------------------------
         mac.act("WAKEUP_CR2_WREN_1",
-                NextValue(mac_count, mac_count - 1),
+                NextValue(mac_count, mac_count-1),
                 If(mac_count == 0,
                    NextValue(spi_cs_n, 0),
                    NextValue(spi_do, 0x06),  # WREN to unlock CR2 writing
                    NextValue(spi_req, 1),
                    NextState("WAKEUP_CR2_WREN_1_WAIT"),
-                   )
                 )
+        )
         mac.act("WAKEUP_CR2_WREN_1_WAIT",
                 NextValue(spi_req, 0),
                 If(spi_ack,
                    NextValue(spi_cs_n, 1),
                    NextValue(mac_count, 4),
                    NextState("WAKEUP_CR2_DUMMY_CMD"),
-                   )
                 )
+        )
         mac.act("WAKEUP_CR2_DUMMY_CMD",
-                NextValue(mac_count, mac_count - 1),
+                NextValue(mac_count, mac_count-1),
                 If(mac_count == 0,
                    NextValue(spi_cs_n, 0),
-                   NextValue(spi_do, 0x72),  # CR2 command
+                   NextValue(spi_do, 0x72), # CR2 command
                    NextValue(spi_req, 1),
                    NextValue(mac_count, 2),
                    NextState("WAKEUP_CR2_DUMMY_ADRHI"),
-                   )
                 )
+        )
         mac.act("WAKEUP_CR2_DUMMY_ADRHI",
-                NextValue(spi_do, 0x00),  # we want to send 00_00_03_00
+                NextValue(spi_do, 0x00),   # we want to send 00_00_03_00
                 If(spi_ack,
-                   NextValue(mac_count, mac_count - 1),
-                   ),
+                   NextValue(mac_count, mac_count -1),
+                ),
                 If(mac_count == 0,
                    NextState("WAKEUP_CR2_DUMMY_ADRMID")
-                   )
                 )
+        )
         mac.act("WAKEUP_CR2_DUMMY_ADRMID",
                 NextValue(spi_do, 0x03),
                 If(spi_ack, NextState("WAKEUP_CR2_DUMMY_ADRLO")),
-                )
+        )
         mac.act("WAKEUP_CR2_DUMMY_ADRLO",
                 NextValue(spi_do, 0x00),
                 If(spi_ack, NextState("WAKEUP_CR2_DUMMY_DATA")),
-                )
+        )
         mac.act("WAKEUP_CR2_DUMMY_DATA",
-                NextValue(spi_do, 0x05),  # 10 dummy cycles as required for 84MHz-104MHz operation
+                NextValue(spi_do, 0x05),   # 10 dummy cycles as required for 84MHz-104MHz operation
                 If(spi_ack, NextState("WAKEUP_CR2_DUMMY_WAIT")),
-                )
+        )
         mac.act("WAKEUP_CR2_DUMMY_WAIT",
                 NextValue(spi_req, 0),
                 If(spi_ack,
                    NextValue(spi_cs_n, 1),
                    NextValue(mac_count, 4),
                    NextState("WAKEUP_CR2_WREN_2")
-                   )
                 )
+        )
 
-        # ---------  WREN+CR2 to DOPI mode ------------------------------
+        #---------  WREN+CR2 to DOPI mode ------------------------------
         mac.act("WAKEUP_CR2_WREN_2",
-                NextValue(mac_count, mac_count - 1),
+                NextValue(mac_count, mac_count-1),
                 If(mac_count == 0,
                    NextValue(spi_cs_n, 0),
                    NextValue(spi_do, 0x06),  # WREN to unlock CR2 writing
                    NextValue(spi_req, 1),
                    NextState("WAKEUP_CR2_WREN_2_WAIT"),
-                   )
                 )
+        )
         mac.act("WAKEUP_CR2_WREN_2_WAIT",
                 NextValue(spi_req, 0),
                 If(spi_ack,
                    NextValue(spi_cs_n, 1),
                    NextValue(mac_count, 4),
                    NextState("WAKEUP_CR2_DOPI_CMD"),
-                   )
                 )
+        )
         mac.act("WAKEUP_CR2_DOPI_CMD",
-                NextValue(mac_count, mac_count - 1),
+                NextValue(mac_count, mac_count-1),
                 If(mac_count == 0,
                    NextValue(spi_cs_n, 0),
                    NextValue(spi_do, 0x72),  # CR2 command
                    NextValue(spi_req, 1),
                    NextValue(mac_count, 4),
                    NextState("WAKEUP_CR2_DOPI_ADR"),
-                   )
                 )
+        )
         mac.act("WAKEUP_CR2_DOPI_ADR",  # send 0x00_00_00_00 as address
                 NextValue(spi_do, 0x00),  # no need to raise CS or lower spi_req, this is back-to-back
                 If(spi_ack,
                    NextValue(mac_count, mac_count - 1),
-                   ),
+                ),
                 If(mac_count == 0,
                    NextState("WAKEUP_CR2_DOPI_DATA"),
-                   )
-                ),
-        mac.act("WAKEUP_CR2_DOPI_DATA",
-                NextValue(spi_do, 2),  # enable DOPI mode
-                If(spi_ack, NextState("WAKEUP_CR2_DOPI_WAIT")),
                 )
-        mac.act("WAKEUP_CR2_DOPI_WAIT",  # trailing CS wait
+        ),
+        mac.act("WAKEUP_CR2_DOPI_DATA",
+                NextValue(spi_do, 2),    # enable DOPI mode
+                If(spi_ack, NextState("WAKEUP_CR2_DOPI_WAIT")),
+        )
+        mac.act("WAKEUP_CR2_DOPI_WAIT", # trailing CS wait
                 NextValue(spi_req, 0),
                 If(spi_ack,
                    NextValue(spi_cs_n, 1),
                    NextValue(mac_count, 4),
                    NextState("WAKEUP_CS_EXIT")
-                   )
                 )
+        )
         mac.act("WAKEUP_CS_EXIT",
                 NextValue(self.spi_mode, 0),  # now enter DOPI mode
-                NextValue(mac_count, mac_count - 1),
+                NextValue(mac_count, mac_count-1),
                 If(mac_count == 0,
                    NextState("IDLE"),
-                   )
                 )
+        )
 
         if spiread:
-            # ---------  SPI read machine ------------------------------
+            #---------  SPI read machine ------------------------------
             mac.act("SPI_READ_32",
                     If(addr_updated,
                        NextState("SPI_READ_32_CS"),
@@ -946,87 +933,85 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
                        NextValue(mac_count, 3),
                        NextValue(spi_cs_n, 1),
                        NextValue(spi_req, 0),
-                       ).Else(
+                    ).Else(
                         If(mac_count > 0,
                            NextValue(has_dummy, 0),
                            NextValue(spi_req, 1),
                            NextState("SPI_READ_32_D")
-                           ).Else(
+                        ).Else(
                             NextValue(spi_req, 0),
                             If(spi_ack,
-                               If(self.spi_mode,
-                                  # protect these in a spi_mode mux to prevent excess inference of logic to handle otherwise implicit dual-master situation
-                                  NextValue(self.bus.dat_r, Cat(d_to_wb[8:], spi_di)),
-                                  NextValue(self.bus.ack, 1),
-                                  ),
+                               If(self.spi_mode,  # protect these in a spi_mode mux to prevent excess inference of logic to handle otherwise implicit dual-master situation
+                                   NextValue(self.bus.dat_r, Cat(d_to_wb[8:],spi_di)),
+                                   NextValue(self.bus.ack, 1),
+                               ),
                                NextValue(rom_addr, rom_addr + 1),
                                NextState("IDLE")
-                               )
+                            )
                         )
                     )
-                    )
+            )
             mac.act("SPI_READ_32_D",
                     If(spi_ack,
                        # shift in one byte at a time to d_to_wb(32)
-                       NextValue(d_to_wb, Cat(d_to_wb[8:], spi_di, )),
+                       NextValue(d_to_wb, Cat(d_to_wb[8:],spi_di,)),
                        NextValue(mac_count, mac_count - 1),
                        NextState("SPI_READ_32"),
                        NextValue(rom_addr, rom_addr + 1),
-                       )
                     )
+            )
             mac.act("SPI_READ_32_CS",
-                    NextValue(mac_count, mac_count - 1),
+                    NextValue(mac_count, mac_count-1),
                     If(mac_count == 0,
                        NextValue(spi_cs_n, 0),
                        NextState("SPI_READ_32_A0"),
-                       )
                     )
+            )
             mac.act("SPI_READ_32_A0",
-                    NextValue(spi_do, 0x0c),  # 32-bit address write for "fast read" command
+                    NextValue(spi_do, 0x0c), # 32-bit address write for "fast read" command
                     NextValue(spi_req, 1),
                     NextState("SPI_READ_32_A1"),
-                    )
+            )
             mac.act("SPI_READ_32_A1",
-                    NextValue(spi_do, rom_addr[24:] & 0x7),
-                    # queue up MSB to send, leave req high; mask off unused high bits
+                    NextValue(spi_do, rom_addr[24:] & 0x7), # queue up MSB to send, leave req high; mask off unused high bits
                     If(spi_ack,
                        NextState("SPI_READ_32_A2"),
-                       )
                     )
+            )
             mac.act("SPI_READ_32_A2",
                     NextValue(spi_do, rom_addr[16:24]),
                     If(spi_ack,
                        NextState("SPI_READ_32_A3"),
-                       )
                     )
+            )
             mac.act("SPI_READ_32_A3",
                     NextValue(spi_do, rom_addr[8:16]),
                     If(spi_ack,
                        NextState("SPI_READ_32_A4"),
-                       )
                     )
+            )
             mac.act("SPI_READ_32_A4",
                     NextValue(spi_do, rom_addr[:8]),
                     If(spi_ack,
                        NextState("SPI_READ_32_A5"),
-                       )
                     )
+            )
             mac.act("SPI_READ_32_A5",
                     NextValue(spi_do, 0),
                     If(spi_ack,
                        NextState("SPI_READ_32_DUMMY")
-                       )
                     )
+            )
             mac.act("SPI_READ_32_DUMMY",
                     NextValue(spi_req, 0),
                     NextValue(addr_updated, 0),
                     If(spi_ack,
                        NextState("SPI_READ_32"),
                        NextValue(mac_count, 3),  # prep the MAC state counter to count out 4 bytes
-                       ).Else(
+                    ).Else(
                         NextState("SPI_READ_32_DUMMY")
                     )
-                    )
+            )
 
         # Handle ECS_n -----------------------------------------------------------------------------
         # treat ECS_N as an async signal -- just a "rough guide" of problems
@@ -1045,27 +1030,25 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
             CSRField("ecc_address", size=32, description="Address of the most recent ECC event")
         ])
         self.ecc_status = CSRStatus(fields=[
-            CSRField("ecc_error", size=1,
-                     description="Live status of the ECS_N bit (ECC error on current packet when low)"),
-            CSRField("ecc_overflow", size=1,
-                     description="More than one ECS_N event has happened since th last time ecc_address was checked")
+            CSRField("ecc_error", size=1, description="Live status of the ECS_N bit (ECC error on current packet when low)"),
+            CSRField("ecc_overflow", size=1, description="More than one ECS_N event has happened since th last time ecc_address was checked")
         ])
 
         self.comb += self.ecc_status.fields.ecc_error.eq(ecs_n)
         self.comb += [
-            ecs_pulse.eq(ecs_n_delay & ~ecs_n),  # falling edge -> positive pulse
+            ecs_pulse.eq(ecs_n_delay & ~ecs_n), # falling edge -> positive pulse
             If(ecs_pulse,
                self.ecc_address.fields.ecc_address.eq(rom_addr),
                If(ecc_reported,
                   self.ecc_status.fields.ecc_overflow.eq(1)
-                  ).Else(
+               ).Else(
                    self.ecc_status.fields.ecc_overflow.eq(self.ecc_status.fields.ecc_overflow),
                )
-               ).Else(
+            ).Else(
                 self.ecc_address.fields.ecc_address.eq(self.ecc_address.fields.ecc_address),
                 If(self.ecc_status.we,
                    self.ecc_status.fields.ecc_overflow.eq(0),
-                   ).Else(
+                ).Else(
                     self.ecc_status.fields.ecc_overflow.eq(self.ecc_status.fields.ecc_overflow),
                 )
             )
@@ -1074,7 +1057,7 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
             ecs_n_delay.eq(ecs_n),
             If(ecs_pulse,
                ecc_reported.eq(1)
-               ).Elif(self.ecc_address.we,
-                      ecc_reported.eq(0)
-                      )
+            ).Elif(self.ecc_address.we,
+                ecc_reported.eq(0)
+            )
         ]

From 416afd3109c6bb8fa42e76eb1e27ac36f521b663 Mon Sep 17 00:00:00 2001
From: bunnie <bunnie@kosagi.com>
Date: Thu, 6 Feb 2020 17:56:21 +0800
Subject: [PATCH 3/5] add doc comment for event

---
 litex/soc/cores/spiopi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litex/soc/cores/spiopi.py b/litex/soc/cores/spiopi.py
index 12a05d71d..937d8e5ec 100644
--- a/litex/soc/cores/spiopi.py
+++ b/litex/soc/cores/spiopi.py
@@ -1019,7 +1019,7 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         self.specials += MultiReg(pads.ecs_n, ecs_n)
 
         self.submodules.ev = EventManager()
-        self.ev.ecc_error = EventSourceProcess()  # Falling edge triggered
+        self.ev.ecc_error = EventSourceProcess(description="An ECC event has happened on the current block; triggered by falling edge of ECC_N")
         self.ev.finalize()
         self.comb += self.ev.ecc_error.trigger.eq(ecs_n)
         ecc_reported = Signal()

From d2b394a9be3e4e118b73954ccae5bcbc4cbf072d Mon Sep 17 00:00:00 2001
From: bunnie <bunnie@kosagi.com>
Date: Thu, 6 Feb 2020 17:58:02 +0800
Subject: [PATCH 4/5] update doc comments on events for i2s

---
 litex/soc/cores/i2s.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/litex/soc/cores/i2s.py b/litex/soc/cores/i2s.py
index 1125f7920..512b21646 100644
--- a/litex/soc/cores/i2s.py
+++ b/litex/soc/cores/i2s.py
@@ -131,11 +131,11 @@ class i2s_slave(Module, AutoCSR, AutoDoc):
         # interrupts
         self.submodules.ev = EventManager()
         if hasattr(pads, 'rx'):
-            self.ev.rx_ready = EventSourcePulse()  # rising edge triggered, indicates FIFO is ready to read
-            self.ev.rx_error = EventSourcePulse()  # indicates a rx error has happened
+            self.ev.rx_ready = EventSourcePulse(description="Indicates FIFO is ready to read")  # rising edge triggered
+            self.ev.rx_error = EventSourcePulse(description="Indicates an Rx error has happened (over/underflow)")
         if hasattr(pads, 'tx'):
-            self.ev.tx_ready = EventSourcePulse()  # indicates space available in Tx buffer for next quantum
-            self.ev.tx_error = EventSourcePulse()  # indicates a tx error has happened
+            self.ev.tx_ready = EventSourcePulse(description="Indicates enough space available for next Tx quanta of {} words".format(fifodepth))
+            self.ev.tx_error = EventSourcePulse(description="Indicates a Tx error has happened (over/underflow")
         self.ev.finalize()
 
 

From 98e46c2708810bbbeae325a0667d9be3aff88180 Mon Sep 17 00:00:00 2001
From: bunnie <bunnie@kosagi.com>
Date: Thu, 6 Feb 2020 21:55:44 +0800
Subject: [PATCH 5/5] reduce indents

---
 litex/soc/cores/spiopi.py | 78 +++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/litex/soc/cores/spiopi.py b/litex/soc/cores/spiopi.py
index 937d8e5ec..c9237c37c 100644
--- a/litex/soc/cores/spiopi.py
+++ b/litex/soc/cores/spiopi.py
@@ -130,25 +130,25 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
             if sim == False:
                 if i == 1: # only wire up o_CNTVALUEOUT for one instance
                     self.specials += Instance("IDELAYE2",
-                             p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
-                             p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
-                             p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
+                         p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                         p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
+                         p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
 
-                             i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
-                             i_LD=self.delay_update,
-                             i_CNTVALUEIN=self.delay_config.fields.d, o_CNTVALUEOUT=self.delay_status.fields.q,
-                             i_IDATAIN=dq.i[i-1], o_DATAOUT=dq_delayed[i],
+                         i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
+                         i_LD=self.delay_update,
+                         i_CNTVALUEIN=self.delay_config.fields.d, o_CNTVALUEOUT=self.delay_status.fields.q,
+                         i_IDATAIN=dq.i[i-1], o_DATAOUT=dq_delayed[i],
                     ),
                 else: # don't wire up o_CNTVALUEOUT for others
                     self.specials += Instance("IDELAYE2",
-                              p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
-                              p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
-                              p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
+                          p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                          p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
+                          p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
 
-                              i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
-                              i_LD=self.delay_update,
-                              i_CNTVALUEIN=self.delay_config.fields.d,
-                              i_IDATAIN=dq.i[i-1], o_DATAOUT=dq_delayed[i],
+                          i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
+                          i_LD=self.delay_update,
+                          i_CNTVALUEIN=self.delay_config.fields.d,
+                          i_IDATAIN=dq.i[i-1], o_DATAOUT=dq_delayed[i],
                   ),
             else:
                 self.comb += dq_delayed[i].eq(dq.i[i-1])
@@ -160,7 +160,7 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         # SPI SDR register
         self.specials += [
             Instance("FDRE", name="{}".format(miso_name), i_C=~ClockSignal("spinor"), i_CE=1, i_R=0, o_Q=self.miso,
-                     i_D=dq_delayed[1],
+                 i_D=dq_delayed[1],
             )
         ]
 
@@ -184,15 +184,15 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         if sim == False:
             self.specials += [
             Instance("IDELAYE2",
-                     p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
-                     p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
-                     p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
+                 p_DELAY_SRC="IDATAIN", p_SIGNAL_PATTERN="DATA",
+                 p_CINVCTRL_SEL="FALSE", p_HIGH_PERFORMANCE_MODE="FALSE", p_REFCLK_FREQUENCY=200.0,
+                 p_PIPE_SEL="FALSE", p_IDELAY_VALUE=dq_delay_taps, p_IDELAY_TYPE=delay_type,
 
-                     i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
-                     i_LD=self.delay_update,
-                     i_CNTVALUEIN=self.delay_config.fields.d,
-                     i_IDATAIN=dq_mosi.i, o_DATAOUT=dq_delayed[0],
-                     ),
+                 i_C=ClockSignal(), i_CINVCTRL=0, i_REGRST=0, i_LDPIPEEN=0, i_INC=0, i_CE=0,
+                 i_LD=self.delay_update,
+                 i_CNTVALUEIN=self.delay_config.fields.d,
+                 i_IDATAIN=dq_mosi.i, o_DATAOUT=dq_delayed[0],
+                 ),
             ]
         else:
             self.comb += dq_delayed[0].eq(dq_mosi.i)
@@ -202,14 +202,14 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
         self.specials += [
             # de-activate the CCLK interface, parallel it with a GPIO
             Instance("STARTUPE2",
-                     i_CLK=0, i_GSR=0, i_GTS=0, i_KEYCLEARB=0, i_PACK=0, i_USRDONEO=1, i_USRDONETS=1,
-                     i_USRCCLKO=0, i_USRCCLKTS=1,  # force to tristate
-                     ),
+                 i_CLK=0, i_GSR=0, i_GTS=0, i_KEYCLEARB=0, i_PACK=0, i_USRDONEO=1, i_USRDONETS=1,
+                 i_USRCCLKO=0, i_USRCCLKTS=1,  # force to tristate
+                 ),
             Instance("ODDR", name=sclk_name, # need to name this so we can constrain it properly
-                     p_DDR_CLK_EDGE="SAME_EDGE",
-                     i_C=ClockSignal("spinor"), i_R=ResetSignal("spinor"), i_S=0, i_CE=1,
-                     i_D1=clk_en, i_D2=0, o_Q=pads.sclk,
-                     )
+                 p_DDR_CLK_EDGE="SAME_EDGE",
+                 i_C=ClockSignal("spinor"), i_R=ResetSignal("spinor"), i_S=0, i_CE=1,
+                 i_D1=clk_en, i_D2=0, o_Q=pads.sclk,
+                 )
         ]
 
         # wire up CS_N
@@ -397,21 +397,21 @@ class SpiOpi(Module, AutoCSR, AutoDoc):
             # two DQS strobes (as they are pipe-filling) and (b) alternates with the correct phase so we are
             # sampling 32-bit data into the FIFO.
             Instance("FDCE", name="FDCE_WREN",
-                     i_C=dqs_iobuf, i_D=~wrendiv, o_Q=wrendiv, i_CE=1, i_CLR=~rx_wren,
+                 i_C=dqs_iobuf, i_D=~wrendiv, o_Q=wrendiv, i_CE=1, i_CLR=~rx_wren,
             ),
             Instance("FDCE", name="FDCE_WREN",
-                     i_C=dqs_iobuf, i_D=~wrendiv2, o_Q=wrendiv2, i_CE=wrendiv & ~wrendiv2, i_CLR=~rx_wren,
+                 i_C=dqs_iobuf, i_D=~wrendiv2, o_Q=wrendiv2, i_CE=wrendiv & ~wrendiv2, i_CLR=~rx_wren,
             ),
             # Direct FIFO primitive is more resource-efficient and faster than migen primitive.
             Instance("FIFO_DUALCLOCK_MACRO",
-                     p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32, p_FIRST_WORD_FALL_THROUGH="TRUE",
-                     p_ALMOST_EMPTY_OFFSET=6, p_ALMOST_FULL_OFFSET=(512- (8*prefetch_lines)),
+                 p_DEVICE="7SERIES", p_FIFO_SIZE="18Kb", p_DATA_WIDTH=32, p_FIRST_WORD_FALL_THROUGH="TRUE",
+                 p_ALMOST_EMPTY_OFFSET=6, p_ALMOST_FULL_OFFSET=(512- (8*prefetch_lines)),
 
-                     o_ALMOSTEMPTY=rx_almostempty, o_ALMOSTFULL=rx_almostfull,
-                     o_DO=opi_fifo_rd, o_EMPTY=rx_empty, o_FULL=rx_full,
-                     o_RDCOUNT=rx_rdcount, o_RDERR=rx_rderr, o_WRCOUNT=rx_wrcount, o_WRERR=rx_wrerr,
-                     i_DI=opi_fifo_wd, i_RDCLK=ClockSignal(), i_RDEN=rx_rden,
-                     i_WRCLK=dqs_iobuf, i_WREN=wrendiv & wrendiv2, i_RST=rx_fifo_rst,
+                 o_ALMOSTEMPTY=rx_almostempty, o_ALMOSTFULL=rx_almostfull,
+                 o_DO=opi_fifo_rd, o_EMPTY=rx_empty, o_FULL=rx_full,
+                 o_RDCOUNT=rx_rdcount, o_RDERR=rx_rderr, o_WRCOUNT=rx_wrcount, o_WRERR=rx_wrerr,
+                 i_DI=opi_fifo_wd, i_RDCLK=ClockSignal(), i_RDEN=rx_rden,
+                 i_WRCLK=dqs_iobuf, i_WREN=wrendiv & wrendiv2, i_RST=rx_fifo_rst,
             )
         ]
         self.sync.dqs += opi_di.eq(self.di)