From b01deddb1b113994b71786cb11408f2c75f96acc Mon Sep 17 00:00:00 2001
From: Jerry Zhao
Date: Fri, 9 Feb 2024 20:34:22 -0800
Subject: [PATCH] Towards dual-ported TLB

---
 src/main/scala/dmem/TLB.scala | 471 +++++++++++++++++-----------------
 src/main/scala/exu/Core.scala |  32 +--
 2 files changed, 255 insertions(+), 248 deletions(-)

diff --git a/src/main/scala/dmem/TLB.scala b/src/main/scala/dmem/TLB.scala
index f0ef927..d6c04da 100644
--- a/src/main/scala/dmem/TLB.scala
+++ b/src/main/scala/dmem/TLB.scala
@@ -37,10 +37,10 @@ class ShuttleDTLBResp(implicit p: Parameters) extends CoreBundle()(p) {
 }


-class ShuttleDTLB(lgMaxSize: Int, cfg: TLBConfig)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(p) {
+class ShuttleDTLB(ports: Int, lgMaxSize: Int, cfg: TLBConfig)(implicit edge: TLEdgeOut, p: Parameters) extends CoreModule()(p) {
   val io = IO(new Bundle {
-    val req = Flipped(Decoupled(new ShuttleDTLBReq(lgMaxSize)))
-    val resp = Output(new ShuttleDTLBResp)
+    val req = Vec(ports, Flipped(Decoupled(new ShuttleDTLBReq(lgMaxSize))))
+    val resp = Vec(ports, Output(new ShuttleDTLBResp))
     val sfence = Flipped(Valid(new SFenceReq))
     val ptw = new TLBPTWIO
   })
@@ -50,20 +50,10 @@ class ShuttleDTLB(lgMaxSize: Int, cfg: TLBConfig)(implicit edge: TLEdgeOut, p: P
   require(!usingHypervisor)
   require(usingVM)

-  val vpn = io.req.bits.vaddr(vaddrBits-1, pgIdxBits)
-  /** index for sectored_Entry */
-  val memIdx = vpn.extract(cfg.nSectors.log2 + cfg.nSets.log2 - 1, cfg.nSectors.log2)
   /** TLB Entry */
   val sectored_entries = Reg(Vec(cfg.nSets, Vec(cfg.nWays / cfg.nSectors, new TLBEntry(cfg.nSectors, false, false))))
   /** Superpage Entry */
   val superpage_entries = Reg(Vec(cfg.nSuperpageEntries, new TLBEntry(1, true, true)))
-  /** Special Entry
-   *
-   * If PMP granularity is less than page size, thus need additional "special" entry manage PMP.
-   */
-  def ordinary_entries = sectored_entries(memIdx) ++ superpage_entries
-  def all_entries = ordinary_entries
-  def all_real_entries = sectored_entries.flatten ++ superpage_entries

   val s_ready :: s_request :: s_wait :: s_wait_invalidate :: Nil = Enum(4)
   val state = RegInit(s_ready)
@@ -74,15 +64,6 @@ class ShuttleDTLB(lgMaxSize: Int, cfg: TLBConfig)(implicit edge: TLEdgeOut, p: P
   val r_sectored_hit = Reg(Valid(UInt(log2Ceil(sectored_entries.head.size).W)))
   val r_superpage_hit = Reg(Valid(UInt(log2Ceil(superpage_entries.size).W)))

-  /** privilege mode */
-  val priv = io.req.bits.prv
-  val priv_s = priv(0)
-  // user mode and supervisor mode
-  val priv_uses_vm = priv <= PRV.S.U
-  val satp = io.ptw.ptbr
-  val stage1_en = satp.mode(satp.mode.getWidth-1)
-  val vm_enabled = stage1_en && priv_uses_vm
-
   // share a single physical memory attribute checker (unshare if critical path)
   val refill_ppn = io.ptw.resp.bits.pte.ppn(ppnBits-1, 0)
   /** refill signal */
@@ -90,199 +71,6 @@ class ShuttleDTLB(lgMaxSize: Int, cfg: TLBConfig)(implicit edge: TLEdgeOut, p: P
   /** sfence invalidate refill */
   val invalidate_refill = state.isOneOf(s_request /* don't care */, s_wait_invalidate) || io.sfence.valid

-  val mpu_ppn = Mux(do_refill, refill_ppn, io.req.bits.vaddr >> pgIdxBits)
-  val mpu_physaddr = Cat(mpu_ppn, io.req.bits.vaddr(pgIdxBits-1, 0))
-  val mpu_priv = Mux[UInt](do_refill, PRV.S.U, Cat(io.ptw.status.debug, priv))
-
-  // PMA
-  val pma = Module(new PMAChecker(edge)(p))
-  pma.io.paddr := mpu_physaddr
-  // todo: using DataScratchpad doesn't support cacheable.
-  val cacheable = pma.io.resp.cacheable
-  val homogeneous = TLBPageLookup(edge.manager.managers, xLen, p(CacheBlockBytes), BigInt(1) << pgIdxBits)(mpu_physaddr).homogeneous
-  val deny_access_to_debug = mpu_priv <= PRV.M.U && p(DebugModuleKey).map(dmp => dmp.address.contains(mpu_physaddr)).getOrElse(false.B)
-  val prot_r = pma.io.resp.r && !deny_access_to_debug
-  val prot_w = pma.io.resp.w && !deny_access_to_debug
-  val prot_pp = pma.io.resp.pp
-  val prot_al = pma.io.resp.al
-  val prot_aa = pma.io.resp.aa
-  val prot_x = pma.io.resp.x && !deny_access_to_debug
-  val prot_eff = pma.io.resp.eff
-
-  // hit check
-  val sector_hits = sectored_entries(memIdx).map(_.sectorHit(vpn, false.B))
-  val superpage_hits = superpage_entries.map(_.hit(vpn, false.B))
-  val hitsVec = all_entries.map(vm_enabled && _.hit(vpn, false.B))
-  val real_hits = hitsVec.asUInt
-  val hits = Cat(!vm_enabled, real_hits)
-
-  // use ptw response to refill
-  // permission bit arrays
-  when (do_refill) {
-    val pte = io.ptw.resp.bits.pte
-    val refill_v = false.B
-    val newEntry = Wire(new TLBEntryData)
-    newEntry.ppn := pte.ppn
-    newEntry.c := cacheable
-    newEntry.u := pte.u
-    newEntry.g := pte.g && pte.v
-    newEntry.ae_ptw := io.ptw.resp.bits.ae_ptw
-    newEntry.ae_final := io.ptw.resp.bits.ae_final
-    newEntry.ae_stage2 := false.B
-    newEntry.pf := io.ptw.resp.bits.pf
-    newEntry.gf := io.ptw.resp.bits.gf
-    newEntry.hr := io.ptw.resp.bits.hr
-    newEntry.hw := io.ptw.resp.bits.hw
-    newEntry.hx := io.ptw.resp.bits.hx
-    newEntry.sr := pte.sr()
-    newEntry.sw := pte.sw()
-    newEntry.sx := pte.sx()
-    newEntry.pr := prot_r
-    newEntry.pw := prot_w
-    newEntry.px := prot_x
-    newEntry.ppp := prot_pp
-    newEntry.pal := prot_al
-    newEntry.paa := prot_aa
-    newEntry.eff := prot_eff
-    newEntry.fragmented_superpage := io.ptw.resp.bits.fragmented_superpage
-    // refill special_entry
-    when (io.ptw.resp.bits.level < (pgLevels-1).U) {
-      val waddr = r_superpage_repl_addr
-      for ((e, i) <- superpage_entries.zipWithIndex) when (r_superpage_repl_addr === i.U) {
-        e.insert(r_refill_tag, refill_v, io.ptw.resp.bits.level, newEntry)
-        when (invalidate_refill) { e.invalidate() }
-      }
-    // refill sectored_hit
-    }.otherwise {
-      val r_memIdx = r_refill_tag.extract(cfg.nSectors.log2 + cfg.nSets.log2 - 1, cfg.nSectors.log2)
-      val waddr = Mux(r_sectored_hit.valid, r_sectored_hit.bits, r_sectored_repl_addr)
-      for ((e, i) <- sectored_entries(r_memIdx).zipWithIndex) when (waddr === i.U) {
-        when (!r_sectored_hit.valid) { e.invalidate() }
-        e.insert(r_refill_tag, refill_v, 0.U, newEntry)
-        when (invalidate_refill) { e.invalidate() }
-      }
-    }
-  }
-
-  // get all entries data.
-  val entries = all_entries.map(_.getData(vpn))
-  val normal_entries = entries.take(ordinary_entries.size)
-  // parallel query PPN from [[all_entries]], if VM not enabled return VPN instead
-  val ppn = Mux1H(hitsVec :+ !vm_enabled, (all_entries zip entries).map{ case (entry, data) => entry.ppn(vpn, data) } :+ vpn(ppnBits-1, 0))
-
-  val nPhysicalEntries = 1
-  // generally PTW misaligned load exception.
-  val ptw_ae_array = Cat(false.B, entries.map(_.ae_ptw).asUInt)
-  val final_ae_array = Cat(false.B, entries.map(_.ae_final).asUInt)
-  val ptw_pf_array = Cat(false.B, entries.map(_.pf).asUInt)
-  val ptw_gf_array = Cat(false.B, entries.map(_.gf).asUInt)
-  val sum = io.ptw.status.sum
-  // if in hypervisor/machine mode, cannot read/write user entries.
-  // if in superviosr/user mode, "If the SUM bit in the sstatus register is set, supervisor mode software may also access pages with U=1.(from spec)"
-  val priv_rw_ok = Mux(!priv_s || sum, entries.map(_.u).asUInt, 0.U) | Mux(priv_s, ~entries.map(_.u).asUInt, 0.U)
-  // if in hypervisor/machine mode, other than user pages, all pages are executable.
-  // if in superviosr/user mode, only user page can execute.
-  val priv_x_ok = Mux(priv_s, ~entries.map(_.u).asUInt, entries.map(_.u).asUInt)
-  val mxr = io.ptw.status.mxr
-  // "The vsstatus field MXR, which makes execute-only pages readable, only overrides VS-stage page protection.(from spec)"
-  val r_array = Cat(true.B, (priv_rw_ok & (entries.map(_.sr).asUInt | Mux(mxr, entries.map(_.sx).asUInt, 0.U))))
-  val w_array = Cat(true.B, (priv_rw_ok & entries.map(_.sw).asUInt))
-  val x_array = Cat(true.B, (priv_x_ok & entries.map(_.sx).asUInt))
-  // These array is for each TLB entries.
-  // user mode can read: PMA OK, TLB OK, AE OK
-  val pr_array = Cat(Fill(nPhysicalEntries, prot_r), normal_entries.map(_.pr).asUInt) & ~(ptw_ae_array | final_ae_array)
-  // user mode can write: PMA OK, TLB OK, AE OK
-  val pw_array = Cat(Fill(nPhysicalEntries, prot_w), normal_entries.map(_.pw).asUInt) & ~(ptw_ae_array | final_ae_array)
-  // user mode can write: PMA OK, TLB OK, AE OK
-  val px_array = Cat(Fill(nPhysicalEntries, prot_x), normal_entries.map(_.px).asUInt) & ~(ptw_ae_array | final_ae_array)
-  // put effect
-  val eff_array = Cat(Fill(nPhysicalEntries, prot_eff), normal_entries.map(_.eff).asUInt)
-  // cacheable
-  val c_array = Cat(Fill(nPhysicalEntries, cacheable), normal_entries.map(_.c).asUInt)
-
-  // vaddr misaligned: vaddr[1:0]=b00
-  val misaligned = (io.req.bits.vaddr & (UIntToOH(io.req.bits.size) - 1.U)).orR
-  def badVA: Bool = {
-    val additionalPgLevels = (satp).additionalPgLevels
-    val extraBits = 0
-    val signed = true
-    val nPgLevelChoices = pgLevels - minPgLevels + 1
-    val minVAddrBits = pgIdxBits + minPgLevels * pgLevelBits + extraBits
-    (for (i <- 0 until nPgLevelChoices) yield {
-      val mask = ((BigInt(1) << vaddrBitsExtended) - (BigInt(1) << (minVAddrBits + i * pgLevelBits - signed.toInt))).U
-      val maskedVAddr = io.req.bits.vaddr & mask
-      additionalPgLevels === i.U && !(maskedVAddr === 0.U || signed.B && maskedVAddr === mask)
-    }).orR
-  }
-  val bad_va =
-    if (!usingVM || (minPgLevels == pgLevels && vaddrBits == vaddrBitsExtended)) false.B
-    else vm_enabled && stage1_en && badVA
-
-  val cmd_lrsc = usingAtomics.B && io.req.bits.cmd.isOneOf(M_XLR, M_XSC)
-  val cmd_amo_logical = usingAtomics.B && isAMOLogical(io.req.bits.cmd)
-  val cmd_amo_arithmetic = usingAtomics.B && isAMOArithmetic(io.req.bits.cmd)
-  val cmd_put_partial = io.req.bits.cmd === M_PWR
-  val cmd_read = isRead(io.req.bits.cmd)
-  val cmd_write = isWrite(io.req.bits.cmd)
-  val cmd_write_perms = cmd_write ||
-    io.req.bits.cmd.isOneOf(M_FLUSH_ALL, M_WOK) // not a write, but needs write permissions
-
-  val lrscAllowed = Mux((usingDataScratchpad || usingAtomicsOnlyForIO).B, 0.U, c_array)
-  val ae_array =
-    Mux(misaligned, eff_array, 0.U) |
-    Mux(cmd_lrsc, ~lrscAllowed, 0.U)
-
-  // access exception needs SoC information from PMA
-  val ae_ld_array = Mux(cmd_read, ae_array | ~pr_array, 0.U)
-  val ae_st_array =
-    Mux(cmd_write_perms, ae_array | ~pw_array, 0.U) |
-    Mux(cmd_put_partial, ~c_array, 0.U) |
-    Mux(cmd_amo_logical, ~c_array, 0.U) |
-    Mux(cmd_amo_arithmetic, ~c_array, 0.U)
-  val must_alloc_array =
-    Mux(cmd_put_partial, ~c_array, 0.U) |
-    Mux(cmd_amo_logical, ~c_array, 0.U) |
-    Mux(cmd_amo_arithmetic, ~c_array, 0.U) |
-    Mux(cmd_lrsc, ~0.U(c_array.getWidth.W), 0.U)
-  val pf_ld_array = Mux(cmd_read, ((~r_array & ~ptw_ae_array) | ptw_pf_array) & ~ptw_gf_array, 0.U)
-  val pf_st_array = Mux(cmd_write_perms, ((~w_array & ~ptw_ae_array) | ptw_pf_array) & ~ptw_gf_array, 0.U)
-
-  val tlb_hit = (real_hits).orR
-  // leads to s_request
-  val tlb_miss = vm_enabled && !bad_va && !tlb_hit
-
-  val sectored_plru = new SetAssocLRU(cfg.nSets, sectored_entries.head.size, "plru")
-  val superpage_plru = new PseudoLRU(superpage_entries.size)
-  when (io.req.valid && vm_enabled) {
-    // replace
-    when (sector_hits.orR) { sectored_plru.access(memIdx, OHToUInt(sector_hits)) }
-    when (superpage_hits.orR) { superpage_plru.access(OHToUInt(superpage_hits)) }
-  }
-
-  // Superpages create the possibility that two entries in the TLB may match.
-  // This corresponds to a software bug, but we can't return complete garbage;
-  // we must return either the old translation or the new translation. This
-  // isn't compatible with the Mux1H approach. So, flush the TLB and report
-  // a miss on duplicate entries.
-  val multipleHits = PopCountAtLeast(real_hits, 2)
-
-  // only pull up req.ready when this is s_ready state.
-  io.req.ready := state === s_ready
-  // page fault
-  io.resp.pf.ld := (bad_va && cmd_read) || (pf_ld_array & hits).orR
-  io.resp.pf.st := (bad_va && cmd_write_perms) || (pf_st_array & hits).orR
-  io.resp.pf.inst := DontCare
-  // access exception
-  io.resp.ae.ld := (ae_ld_array & hits).orR
-  io.resp.ae.st := (ae_st_array & hits).orR
-  io.resp.ae.inst := DontCare
-  // misaligned
-  io.resp.ma.ld := misaligned && cmd_read
-  io.resp.ma.st := misaligned && cmd_write
-  io.resp.ma.inst := DontCare
-  io.resp.miss := do_refill || tlb_miss || multipleHits
-  io.resp.paddr := Cat(ppn, io.req.bits.vaddr(pgIdxBits-1, 0))
-
   io.ptw.req.valid := state === s_request
   io.ptw.req.bits.valid := true.B
   io.ptw.req.bits.bits.addr := r_refill_tag
@@ -291,19 +79,7 @@ class ShuttleDTLB(lgMaxSize: Int, cfg: TLBConfig)(implicit edge: TLEdgeOut, p: P
   io.ptw.req.bits.bits.need_gpa := false.B

   val sfence = io.sfence.valid
-  // this is [[s_ready]]
-  // handle miss/hit at the first cycle.
-  // if miss, request PTW(L2TLB).
-  when (io.req.fire && tlb_miss) {
-    state := s_request
-    r_refill_tag := vpn
-    r_superpage_repl_addr := replacementEntry(superpage_entries, superpage_plru.way)
-    r_sectored_repl_addr := replacementEntry(sectored_entries(memIdx), sectored_plru.way(memIdx))
-    r_sectored_hit.valid := sector_hits.orR
-    r_sectored_hit.bits := OHToUInt(sector_hits)
-    r_superpage_hit.valid := superpage_hits.orR
-    r_superpage_hit.bits := OHToUInt(superpage_hits)
-  }
+
   // Handle SFENCE.VMA when send request to PTW.
   // SFENCE.VMA   io.ptw.req.ready   kill
   //    ?                ?             1
@@ -329,17 +105,248 @@ class ShuttleDTLB(lgMaxSize: Int, cfg: TLBConfig)(implicit edge: TLEdgeOut, p: P
   // SFENCE processing logic.
   when (sfence) {
-    assert(!io.sfence.bits.rs1 || (io.sfence.bits.addr >> pgIdxBits) === vpn)
+    for (i <- 0 until ports) {
+      assert(!io.sfence.bits.rs1 || (io.sfence.bits.addr >> pgIdxBits) === (io.req(i).bits.vaddr >> pgIdxBits))
+    }
+    def all_real_entries = sectored_entries.flatten ++ superpage_entries
     for (e <- all_real_entries) {
       val hv = false.B
       val hg = false.B
-      when (!hg && io.sfence.bits.rs1) { e.invalidateVPN(vpn, hv) }
+      when (!hg && io.sfence.bits.rs1) { e.invalidateVPN(io.req(0).bits.vaddr(vaddrBits-1,pgIdxBits), hv) }
       .elsewhen (!hg && io.sfence.bits.rs2) { e.invalidateNonGlobal(hv) }
       .otherwise { e.invalidate(hv || hg) }
     }
   }
-  when (multipleHits || reset.asBool) {
-    all_real_entries.foreach(_.invalidate())
+
+
+  for (i <- (0 until ports).reverse) {
+
+    val vpn = io.req(i).bits.vaddr(vaddrBits-1, pgIdxBits)
+    /** index for sectored_Entry */
+    val memIdx = vpn.extract(cfg.nSectors.log2 + cfg.nSets.log2 - 1, cfg.nSectors.log2)
+    def ordinary_entries = sectored_entries(memIdx) ++ superpage_entries
+    def all_entries = ordinary_entries
+    def all_real_entries = sectored_entries.flatten ++ superpage_entries
+
+
+    /** privilege mode */
+    val priv = io.req(i).bits.prv
+    val priv_s = priv(0)
+    // user mode and supervisor mode
+    val priv_uses_vm = priv <= PRV.S.U
+    val satp = io.ptw.ptbr
+    val stage1_en = satp.mode(satp.mode.getWidth-1)
+    val vm_enabled = stage1_en && priv_uses_vm
+
+    val mpu_ppn = Mux(do_refill, refill_ppn, io.req(i).bits.vaddr >> pgIdxBits)
+    val mpu_physaddr = Cat(mpu_ppn, io.req(i).bits.vaddr(pgIdxBits-1, 0))
+    val mpu_priv = Mux[UInt](do_refill, PRV.S.U, Cat(io.ptw.status.debug, priv))
+
+    // PMA
+    val pma = Module(new PMAChecker(edge)(p))
+    pma.io.paddr := mpu_physaddr
+    // todo: using DataScratchpad doesn't support cacheable.
+    val cacheable = pma.io.resp.cacheable
+    val homogeneous = TLBPageLookup(edge.manager.managers, xLen, p(CacheBlockBytes), BigInt(1) << pgIdxBits)(mpu_physaddr).homogeneous
+    val deny_access_to_debug = mpu_priv <= PRV.M.U && p(DebugModuleKey).map(dmp => dmp.address.contains(mpu_physaddr)).getOrElse(false.B)
+    val prot_r = pma.io.resp.r && !deny_access_to_debug
+    val prot_w = pma.io.resp.w && !deny_access_to_debug
+    val prot_pp = pma.io.resp.pp
+    val prot_al = pma.io.resp.al
+    val prot_aa = pma.io.resp.aa
+    val prot_x = pma.io.resp.x && !deny_access_to_debug
+    val prot_eff = pma.io.resp.eff
+
+    // hit check
+    val sector_hits = sectored_entries(memIdx).map(_.sectorHit(vpn, false.B))
+    val superpage_hits = superpage_entries.map(_.hit(vpn, false.B))
+    val hitsVec = all_entries.map(vm_enabled && _.hit(vpn, false.B))
+    val real_hits = hitsVec.asUInt
+    val hits = Cat(!vm_enabled, real_hits)
+
+    // use ptw response to refill
+    // permission bit arrays
+    if (i == 0) { when (do_refill) {
+      val pte = io.ptw.resp.bits.pte
+      val refill_v = false.B
+      val newEntry = Wire(new TLBEntryData)
+      newEntry.ppn := pte.ppn
+      newEntry.c := cacheable
+      newEntry.u := pte.u
+      newEntry.g := pte.g && pte.v
+      newEntry.ae_ptw := io.ptw.resp.bits.ae_ptw
+      newEntry.ae_final := io.ptw.resp.bits.ae_final
+      newEntry.ae_stage2 := false.B
+      newEntry.pf := io.ptw.resp.bits.pf
+      newEntry.gf := io.ptw.resp.bits.gf
+      newEntry.hr := io.ptw.resp.bits.hr
+      newEntry.hw := io.ptw.resp.bits.hw
+      newEntry.hx := io.ptw.resp.bits.hx
+      newEntry.sr := pte.sr()
+      newEntry.sw := pte.sw()
+      newEntry.sx := pte.sx()
+      newEntry.pr := prot_r
+      newEntry.pw := prot_w
+      newEntry.px := prot_x
+      newEntry.ppp := prot_pp
+      newEntry.pal := prot_al
+      newEntry.paa := prot_aa
+      newEntry.eff := prot_eff
+      newEntry.fragmented_superpage := io.ptw.resp.bits.fragmented_superpage
+      // refill special_entry
+      when (io.ptw.resp.bits.level < (pgLevels-1).U) {
+        val waddr = r_superpage_repl_addr
+        for ((e, i) <- superpage_entries.zipWithIndex) when (r_superpage_repl_addr === i.U) {
+          e.insert(r_refill_tag, refill_v, io.ptw.resp.bits.level, newEntry)
+          when (invalidate_refill) { e.invalidate() }
+        }
+      // refill sectored_hit
+      }.otherwise {
+        val r_memIdx = r_refill_tag.extract(cfg.nSectors.log2 + cfg.nSets.log2 - 1, cfg.nSectors.log2)
+        val waddr = Mux(r_sectored_hit.valid, r_sectored_hit.bits, r_sectored_repl_addr)
+        for ((e, i) <- sectored_entries(r_memIdx).zipWithIndex) when (waddr === i.U) {
+          when (!r_sectored_hit.valid) { e.invalidate() }
+          e.insert(r_refill_tag, refill_v, 0.U, newEntry)
+          when (invalidate_refill) { e.invalidate() }
+        }
+      }
+    }}
+
+    // get all entries data.
+    val entries = all_entries.map(_.getData(vpn))
+    val normal_entries = entries.take(ordinary_entries.size)
+    // parallel query PPN from [[all_entries]], if VM not enabled return VPN instead
+    val ppn = Mux1H(hitsVec :+ !vm_enabled, (all_entries zip entries).map{ case (entry, data) => entry.ppn(vpn, data) } :+ vpn(ppnBits-1, 0))
+
+    val nPhysicalEntries = 1
+    // generally PTW misaligned load exception.
+    val ptw_ae_array = Cat(false.B, entries.map(_.ae_ptw).asUInt)
+    val final_ae_array = Cat(false.B, entries.map(_.ae_final).asUInt)
+    val ptw_pf_array = Cat(false.B, entries.map(_.pf).asUInt)
+    val ptw_gf_array = Cat(false.B, entries.map(_.gf).asUInt)
+    val sum = io.ptw.status.sum
+    // if in hypervisor/machine mode, cannot read/write user entries.
+    // if in supervisor/user mode, "If the SUM bit in the sstatus register is set, supervisor mode software may also access pages with U=1.(from spec)"
+    val priv_rw_ok = Mux(!priv_s || sum, entries.map(_.u).asUInt, 0.U) | Mux(priv_s, ~entries.map(_.u).asUInt, 0.U)
+    // if in hypervisor/machine mode, other than user pages, all pages are executable.
+    // if in supervisor/user mode, only user pages can execute.
+    val priv_x_ok = Mux(priv_s, ~entries.map(_.u).asUInt, entries.map(_.u).asUInt)
+    val mxr = io.ptw.status.mxr
+    // "The vsstatus field MXR, which makes execute-only pages readable, only overrides VS-stage page protection.(from spec)"
+    val r_array = Cat(true.B, (priv_rw_ok & (entries.map(_.sr).asUInt | Mux(mxr, entries.map(_.sx).asUInt, 0.U))))
+    val w_array = Cat(true.B, (priv_rw_ok & entries.map(_.sw).asUInt))
+    val x_array = Cat(true.B, (priv_x_ok & entries.map(_.sx).asUInt))
+    // These arrays have one bit per TLB entry.
+    // user mode can read: PMA OK, TLB OK, AE OK
+    val pr_array = Cat(Fill(nPhysicalEntries, prot_r), normal_entries.map(_.pr).asUInt) & ~(ptw_ae_array | final_ae_array)
+    // user mode can write: PMA OK, TLB OK, AE OK
+    val pw_array = Cat(Fill(nPhysicalEntries, prot_w), normal_entries.map(_.pw).asUInt) & ~(ptw_ae_array | final_ae_array)
+    // user mode can execute: PMA OK, TLB OK, AE OK
+    val px_array = Cat(Fill(nPhysicalEntries, prot_x), normal_entries.map(_.px).asUInt) & ~(ptw_ae_array | final_ae_array)
+    // put effect
+    val eff_array = Cat(Fill(nPhysicalEntries, prot_eff), normal_entries.map(_.eff).asUInt)
+    // cacheable
+    val c_array = Cat(Fill(nPhysicalEntries, cacheable), normal_entries.map(_.c).asUInt)
+
+    // vaddr misaligned: vaddr[1:0]=b00
+    val misaligned = (io.req(i).bits.vaddr & (UIntToOH(io.req(i).bits.size) - 1.U)).orR
+    def badVA: Bool = {
+      val additionalPgLevels = (satp).additionalPgLevels
+      val extraBits = 0
+      val signed = true
+      val nPgLevelChoices = pgLevels - minPgLevels + 1
+      val minVAddrBits = pgIdxBits + minPgLevels * pgLevelBits + extraBits
+      // use a distinct index name so the port index `i` is not shadowed
+      (for (j <- 0 until nPgLevelChoices) yield {
+        val mask = ((BigInt(1) << vaddrBitsExtended) - (BigInt(1) << (minVAddrBits + j * pgLevelBits - signed.toInt))).U
+        val maskedVAddr = io.req(i).bits.vaddr & mask
+        additionalPgLevels === j.U && !(maskedVAddr === 0.U || signed.B && maskedVAddr === mask)
+      }).orR
+    }
+    val bad_va =
+      if (!usingVM || (minPgLevels == pgLevels && vaddrBits == vaddrBitsExtended)) false.B
+      else vm_enabled && stage1_en && badVA
+
+    val cmd_lrsc = usingAtomics.B && io.req(i).bits.cmd.isOneOf(M_XLR, M_XSC)
+    val cmd_amo_logical = usingAtomics.B && isAMOLogical(io.req(i).bits.cmd)
+    val cmd_amo_arithmetic = usingAtomics.B && isAMOArithmetic(io.req(i).bits.cmd)
+    val cmd_put_partial = io.req(i).bits.cmd === M_PWR
+    val cmd_read = isRead(io.req(i).bits.cmd)
+    val cmd_write = isWrite(io.req(i).bits.cmd)
+    val cmd_write_perms = cmd_write ||
+      io.req(i).bits.cmd.isOneOf(M_FLUSH_ALL, M_WOK) // not a write, but needs write permissions
+
+    val lrscAllowed = Mux((usingDataScratchpad || usingAtomicsOnlyForIO).B, 0.U, c_array)
+    val ae_array =
+      Mux(misaligned, eff_array, 0.U) |
+      Mux(cmd_lrsc, ~lrscAllowed, 0.U)
+
+    // access exception needs SoC information from PMA
+    val ae_ld_array = Mux(cmd_read, ae_array | ~pr_array, 0.U)
+    val ae_st_array =
+      Mux(cmd_write_perms, ae_array | ~pw_array, 0.U) |
+      Mux(cmd_put_partial, ~c_array, 0.U) |
+      Mux(cmd_amo_logical, ~c_array, 0.U) |
+      Mux(cmd_amo_arithmetic, ~c_array, 0.U)
+    val must_alloc_array =
+      Mux(cmd_put_partial, ~c_array, 0.U) |
+      Mux(cmd_amo_logical, ~c_array, 0.U) |
+      Mux(cmd_amo_arithmetic, ~c_array, 0.U) |
+      Mux(cmd_lrsc, ~0.U(c_array.getWidth.W), 0.U)
+    val pf_ld_array = Mux(cmd_read, ((~r_array & ~ptw_ae_array) | ptw_pf_array) & ~ptw_gf_array, 0.U)
+    val pf_st_array = Mux(cmd_write_perms, ((~w_array & ~ptw_ae_array) | ptw_pf_array) & ~ptw_gf_array, 0.U)
+
+    val tlb_hit = (real_hits).orR
+    // leads to s_request
+    val tlb_miss = vm_enabled && !bad_va && !tlb_hit
+
+    val sectored_plru = new SetAssocLRU(cfg.nSets, sectored_entries.head.size, "plru")
+    val superpage_plru = new PseudoLRU(superpage_entries.size)
+    when (io.req(i).valid && vm_enabled) {
+      // replace
+      when (sector_hits.orR) { sectored_plru.access(memIdx, OHToUInt(sector_hits)) }
+      when (superpage_hits.orR) { superpage_plru.access(OHToUInt(superpage_hits)) }
+    }
+
+    // Superpages create the possibility that two entries in the TLB may match.
+    // This corresponds to a software bug, but we can't return complete garbage;
+    // we must return either the old translation or the new translation. This
+    // isn't compatible with the Mux1H approach. So, flush the TLB and report
+    // a miss on duplicate entries.
+    val multipleHits = PopCountAtLeast(real_hits, 2)
+
+    // only pull up req.ready when this is s_ready state.
+    io.req(i).ready := state === s_ready
+    // page fault
+    io.resp(i).pf.ld := (bad_va && cmd_read) || (pf_ld_array & hits).orR
+    io.resp(i).pf.st := (bad_va && cmd_write_perms) || (pf_st_array & hits).orR
+    io.resp(i).pf.inst := DontCare
+    // access exception
+    io.resp(i).ae.ld := (ae_ld_array & hits).orR
+    io.resp(i).ae.st := (ae_st_array & hits).orR
+    io.resp(i).ae.inst := DontCare
+    // misaligned
+    io.resp(i).ma.ld := misaligned && cmd_read
+    io.resp(i).ma.st := misaligned && cmd_write
+    io.resp(i).ma.inst := DontCare
+    io.resp(i).miss := do_refill || tlb_miss || multipleHits
+    io.resp(i).paddr := Cat(ppn, io.req(i).bits.vaddr(pgIdxBits-1, 0))
+
+    // this is [[s_ready]]
+    // handle miss/hit at the first cycle.
+    // if miss, request PTW(L2TLB).
+    when (io.req(i).fire && tlb_miss) {
+      state := s_request
+      r_refill_tag := vpn
+      r_superpage_repl_addr := replacementEntry(superpage_entries, superpage_plru.way)
+      r_sectored_repl_addr := replacementEntry(sectored_entries(memIdx), sectored_plru.way(memIdx))
+      r_sectored_hit.valid := sector_hits.orR
+      r_sectored_hit.bits := OHToUInt(sector_hits)
+      r_superpage_hit.valid := superpage_hits.orR
+      r_superpage_hit.bits := OHToUInt(superpage_hits)
+    }
+    when (multipleHits || reset.asBool) {
+      all_real_entries.foreach(_.invalidate())
+    }
   }

   /** Decides which entry to be replaced
    *
diff --git a/src/main/scala/exu/Core.scala b/src/main/scala/exu/Core.scala
index c11c95d..bc9f6c1 100644
--- a/src/main/scala/exu/Core.scala
+++ b/src/main/scala/exu/Core.scala
@@ -503,7 +503,7 @@ class ShuttleCore(tile: ShuttleTile, edge: TLEdgeOut)(implicit p: Parameters) ex

   //mem
-  val dtlb = Module(new ShuttleDTLB(log2Ceil(8), TLBConfig(
+  val dtlb = Module(new ShuttleDTLB(1, log2Ceil(8), TLBConfig(
     tileParams.dcache.get.nTLBSets,
     tileParams.dcache.get.nTLBWays
   ))(edge, p))
@@ -573,22 +573,22 @@ class ShuttleCore(tile: ShuttleTile, edge: TLEdgeOut)(implicit p: Parameters) ex
   val mem_dmem_uop = Mux1H(mem_dmem_oh, mem_uops_reg)

   io.ptw_tlb <> dtlb.io.ptw
-  dtlb.io.req.valid := mem_dmem_uop.valid
-  dtlb.io.req.bits.vaddr := RegEnable(io.dmem.req.bits.addr, ex_uops_reg.map(_.valid).orR)
-  dtlb.io.req.bits.size := mem_dmem_uop.bits.mem_size
-  dtlb.io.req.bits.cmd := mem_dmem_uop.bits.ctrl.mem_cmd
-  dtlb.io.req.bits.prv := csr.io.status.dprv
+  dtlb.io.req(0).valid := mem_dmem_uop.valid
+  dtlb.io.req(0).bits.vaddr := RegEnable(io.dmem.req.bits.addr, ex_uops_reg.map(_.valid).orR)
+  dtlb.io.req(0).bits.size := mem_dmem_uop.bits.mem_size
+  dtlb.io.req(0).bits.cmd := mem_dmem_uop.bits.ctrl.mem_cmd
+  dtlb.io.req(0).bits.prv := csr.io.status.dprv
   dtlb.io.sfence.valid := mem_dmem_uop.valid && mem_dmem_uop.bits.ctrl.mem_cmd === M_SFENCE
   dtlb.io.sfence.bits.rs1 := mem_dmem_uop.bits.mem_size(0)
   dtlb.io.sfence.bits.rs2 := mem_dmem_uop.bits.mem_size(1)
-  dtlb.io.sfence.bits.addr := dtlb.io.req.bits.vaddr
+  dtlb.io.sfence.bits.addr := dtlb.io.req(0).bits.vaddr
   dtlb.io.sfence.bits.asid := mem_dmem_uop.bits.rs2_data
   dtlb.io.sfence.bits.hv := false.B
   dtlb.io.sfence.bits.hg := false.B

   io.dmem.keep_clock_enabled := true.B
-  io.dmem.s1_paddr := dtlb.io.resp.paddr
+  io.dmem.s1_paddr := dtlb.io.resp(0).paddr
   io.dmem.s1_data.data := mem_dmem_uop.bits.rs2_data
   io.dmem.s1_data.mask := DontCare

@@ -616,19 +616,19 @@ class ShuttleCore(tile: ShuttleTile, edge: TLEdgeOut)(implicit p: Parameters) ex
       if (i == 0) wb_uops_reg(i).bits.fdivin := fp_pipe.io.s1_fpiu_fdiv
     }

-    when (mem_uops_reg(i).valid && ctrl.mem && (!dtlb.io.req.ready || dtlb.io.resp.miss)) {
+    when (mem_uops_reg(i).valid && ctrl.mem && (!dtlb.io.req(0).ready || dtlb.io.resp(0).miss)) {
       wb_uops_reg(i).bits.needs_replay := true.B
     }

     val (xcpt, cause) = checkExceptions(List(
-      (mem_uops_reg(0).bits.xcpt , mem_uops_reg(0).bits.xcpt_cause),
-      (ctrl.mem && dtlb.io.resp.ma.st, Causes.misaligned_store.U),
-      (ctrl.mem && dtlb.io.resp.ma.ld, Causes.misaligned_load.U),
-      (ctrl.mem && dtlb.io.resp.pf.st, Causes.store_page_fault.U),
-      (ctrl.mem && dtlb.io.resp.pf.ld, Causes.load_page_fault.U),
-      (ctrl.mem && dtlb.io.resp.ae.st, Causes.store_access.U),
-      (ctrl.mem && dtlb.io.resp.ae.ld, Causes.load_access.U)
+      (mem_uops_reg(0).bits.xcpt , mem_uops_reg(0).bits.xcpt_cause),
+      (ctrl.mem && dtlb.io.resp(0).ma.st, Causes.misaligned_store.U),
+      (ctrl.mem && dtlb.io.resp(0).ma.ld, Causes.misaligned_load.U),
+      (ctrl.mem && dtlb.io.resp(0).pf.st, Causes.store_page_fault.U),
+      (ctrl.mem && dtlb.io.resp(0).pf.ld, Causes.load_page_fault.U),
+      (ctrl.mem && dtlb.io.resp(0).ae.st, Causes.store_access.U),
+      (ctrl.mem && dtlb.io.resp(0).ae.ld, Causes.load_access.U)
     ))

     when (mem_uops_reg(i).valid) {
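
Note (not part of the patch): the diff above only instantiates the new interface with a single port (ports = 1) in ShuttleCore. As a rough sketch of where the change is headed, a second request port could be wired as below. This is illustrative only and follows just the io.req/io.resp Vec interface introduced by the patch; the second_* signals are hypothetical placeholders for whichever pipe would eventually issue the extra translation request each cycle.

  // Illustrative sketch, assuming the ShuttleDTLB interface from this patch.
  val dtlb = Module(new ShuttleDTLB(2, log2Ceil(8), TLBConfig(
    tileParams.dcache.get.nTLBSets,
    tileParams.dcache.get.nTLBWays
  ))(edge, p))

  // Port 0 is driven exactly as in the diff above.
  // Port 1 carries a second, independent translation request; each port
  // gets its own response bundle (paddr, miss, and fault flags).
  dtlb.io.req(1).valid      := second_req_valid   // hypothetical driver
  dtlb.io.req(1).bits.vaddr := second_req_vaddr   // hypothetical driver
  dtlb.io.req(1).bits.size  := second_req_size    // hypothetical driver
  dtlb.io.req(1).bits.cmd   := second_req_cmd     // hypothetical driver
  dtlb.io.req(1).bits.prv   := csr.io.status.dprv

  val second_paddr = dtlb.io.resp(1).paddr        // per-port translation result
  val second_miss  = dtlb.io.resp(1).miss         // per-port miss indication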