diff --git a/openhls/compiler/runner.py b/openhls/compiler/runner.py
index aad2b79..d40b84b 100644
--- a/openhls/compiler/runner.py
+++ b/openhls/compiler/runner.py
@@ -3,6 +3,7 @@
 import itertools
 import os
 from textwrap import indent, dedent
+from multiprocessing.pool import ThreadPool
 
 import numpy as np
 
@@ -118,7 +119,8 @@ def parfor(**kwargs):
     kwargs = tuple(tuple(zip_with_scalar(k, range(*v))) for k, v in kwargs.items())
 
     def wrapper(body):
-        for args in itertools.product(*kwargs):
+        def worker(args):
+            print(f"executing {args=}")
             idx = tuple(i for arg, i in args)
             pe_idx = extend_idx(idx)
             state.state.update_current_pe_idx(pe_idx=pe_idx)
@@ -134,6 +136,10 @@ def wrapper(body):
             else:
                 body(**dict(args))
 
+        with ThreadPool(processes=64) as pool:
+            # map() blocks until every iteration finishes and re-raises a worker error.
+            pool.map(worker, itertools.product(*kwargs))
+
     return wrapper
 
 
diff --git a/openhls/compiler/state.py b/openhls/compiler/state.py
index cd49522..af15d7e 100644
--- a/openhls/compiler/state.py
+++ b/openhls/compiler/state.py
@@ -1,6 +1,6 @@
 import logging
-
-import networkx as nx
+from contextlib import contextmanager
+from threading import RLock
 
 from openhls.config import VAL_PREFIX, DTYPE, DEBUG, INCLUDE_AUX_DEPS
 from openhls.util import extend_idx
@@ -17,7 +17,6 @@
 class State:
     _var_count = 0
     _op_call_count = 0
-    op_graph = nx.MultiDiGraph()
     cst_map = {}
     cst_count = 0
     _pe_idx = (0,)
@@ -26,85 +25,104 @@ class State:
     pe_idx_to_most_recent_op_id = {}
     op_id_to_pe_idx = {}
     pe_deps = set()
+    rlock = None
 
     def __init__(self, output_file):
-        self.op_graph.add_nodes_from(
-            [INPUT_ARG, MEMREF_ARG, GLOBAL_MEMREF_ARG, CONSTANT]
-        )
         self.output_file = output_file
+        self.rlock = RLock()
+
+    @contextmanager
+    def with_rlock(self):
+        # `with` guarantees the lock is released even when the guarded body raises.
+        with self.rlock:
+            yield
 
     def incr_var(self):
-        self._var_count += 1
+        with self.with_rlock():
+            self._var_count += 1
 
     @property
     def curr_var_id(self):
-        return self._var_count
+        with self.with_rlock():
+            return self._var_count
 
     def incr_op_id(self):
-        self._op_call_count += 1
+        with self.with_rlock():
+            self._op_call_count += 1
 
     @property
     def curr_op_id(self):
-        return self._op_call_count
+        with self.with_rlock():
+            return self._op_call_count
 
     def emit(self, *args):
-        print(*args, file=self.output_file)
+        with self.with_rlock():
+            print(*args, file=self.output_file)
 
     def debug_print(self, *args):
         if DEBUG:
-            self.emit(*(["//"] + list(args)))
+            with self.with_rlock():
+                self.emit(*(["//"] + list(args)))
 
     def add_val_source(self, v, src):
-        self.val_source[v] = src
+        with self.with_rlock():
+            self.val_source[v] = src
 
     def add_global_memref_arg(self, v):
-        self.val_source[v] = GLOBAL_MEMREF_ARG
+        with self.with_rlock():
+            self.val_source[v] = GLOBAL_MEMREF_ARG
 
     def add_memref_arg(self, v):
-        self.val_source[v] = MEMREF_ARG
+        with self.with_rlock():
+            self.val_source[v] = MEMREF_ARG
 
     def add_constant(self, v):
-        self.val_source[v] = CONSTANT
+        with self.with_rlock():
+            self.val_source[v] = CONSTANT
 
     def add_op_res(self, v, op):
-        self.val_source[v] = op
+        with self.with_rlock():
+            self.val_source[v] = op
 
     def maybe_add_op(self, op):
-        if op not in self.op_graph.nodes:
-            self.op_graph.add_node(op)
+        pass
 
     def add_edge(self, op, arg, out_v):
-        val_source = self.get_arg_src(arg)
-        self.op_graph.add_edge(val_source, op, input=arg, output=out_v, id=op.op_id)
+        pass
 
     def update_most_recent_pe_idx(self, pe_idx, op):
-        self.pe_idx_to_most_recent_op_id[pe_idx] = op.op_id
+        with self.with_rlock():
+            self.pe_idx_to_most_recent_op_id[pe_idx] = op.op_id
 
     def get_most_recent_op_id(self, pe_idx):
-        return self.pe_idx_to_most_recent_op_id[pe_idx]
+        with self.with_rlock():
+            return self.pe_idx_to_most_recent_op_id[pe_idx]
 
     def maybe_add_aux_dep(self, pe_idx, op):
-        if pe_idx in self.pe_idx_to_most_recent_op_id:
-            prev_op_id = self.get_most_recent_op_id(pe_idx)
-            self.pe_deps.add((prev_op_id, op.op_id))
-        self.update_most_recent_pe_idx(pe_idx, op)
+        with self.with_rlock():
+            if pe_idx in self.pe_idx_to_most_recent_op_id:
+                prev_op_id = self.get_most_recent_op_id(pe_idx)
+                self.pe_deps.add((prev_op_id, op.op_id))
+            self.update_most_recent_pe_idx(pe_idx, op)
 
     def get_arg_src(self, arg):
-        assert arg in self.val_source
-        return self.val_source[arg]
+        with self.with_rlock():
+            assert arg in self.val_source
+            return self.val_source[arg]
 
     def update_current_pe_idx(self, *, pe_idx=None, val=None):
-        assert pe_idx is not None or val is not None
-        if val is not None:
-            src = self.get_arg_src(val)
-            if isinstance(src, str):
-                assert src in {INPUT_ARG, MEMREF_ARG, GLOBAL_MEMREF_ARG, CONSTANT}
-                if src in {MEMREF_ARG, GLOBAL_MEMREF_ARG}:
-                    self.pe_idx = extend_idx(tuple(map(int, val.id.split("_"))))
+        with self.with_rlock():
+            assert pe_idx is not None or val is not None
+            if val is not None:
+                src = self.get_arg_src(val)
+                if isinstance(src, str):
+                    assert src in {INPUT_ARG, MEMREF_ARG, GLOBAL_MEMREF_ARG, CONSTANT}
+                    if src in {MEMREF_ARG, GLOBAL_MEMREF_ARG}:
+                        self.pe_idx = extend_idx(tuple(map(int, val.id.split("_"))))
+                else:
+                    self.pe_idx = src.pe_idx
             else:
-                self.pe_idx = src.pe_idx
-        else:
-            self.pe_idx = pe_idx
+                self.pe_idx = pe_idx
 
     @property
     def dtype(self):
@@ -120,33 +138,41 @@ def val_prefix(self):
 
     @property
     def pe_idx(self):
-        return self._pe_idx
+        with self.with_rlock():
+            return self._pe_idx
 
     @pe_idx.setter
     def pe_idx(self, x):
-        self._pe_idx = x
+        with self.with_rlock():
+            self._pe_idx = x
 
     def map_val_to_pe(self, v, pe_idx):
-        self.val_to_pe_idx[v] = pe_idx
+        with self.with_rlock():
+            self.val_to_pe_idx[v] = pe_idx
 
     def get_val_pe(self, v):
-        return self.val_to_pe_idx[v]
+        with self.with_rlock():
+            return self.val_to_pe_idx[v]
 
     def swap_output_file(self, new_file):
-        old_file = self.output_file
-        self.output_file = new_file
-        return old_file
+        with self.with_rlock():
+            old_file = self.output_file
+            self.output_file = new_file
+            return old_file
 
     def read_output_file(self):
-        self.output_file.seek(0)
-        return self.output_file.read()
+        with self.with_rlock():
+            self.output_file.seek(0)
+            return self.output_file.read()
 
     @property
     def num_unique_pes(self):
-        return len(set(self.val_to_pe_idx.values()))
+        with self.with_rlock():
+            return len(set(self.val_to_pe_idx.values()))
 
     def __del__(self):
-        self.output_file.close()
+        with self.with_rlock():
+            self.output_file.close()
 
 
 state = None
diff --git a/openhls/ir/memref.py b/openhls/ir/memref.py
index 716b230..9fba2ff 100644
--- a/openhls/ir/memref.py
+++ b/openhls/ir/memref.py
@@ -1,3 +1,4 @@
+import sys
 from dataclasses import dataclass
 from typing import Tuple
 
@@ -81,9 +82,18 @@ def reduce_add(self):
     def reduce_max(self):
         return ReduceMax(list(self.registers.flatten()))
 
-    def alias(self, other_memref):
+    def alias(self, other_memref, offsets=None, sizes=None, strides=None):
         assert isinstance(other_memref, MemRef)
-        self.registers = other_memref.registers
+        if offsets is not None and sizes is not None and strides is not None:
+            # memref.subview semantics: result[i] = source[offset + i * stride]
+            # for i in range(size), so each slice must stop at offset + size * stride.
+            subview = tuple(
+                slice(o, o + si * st, st)
+                for o, si, st in zip(offsets, sizes, strides)
+            )
+            self.registers = other_memref.registers[subview]
+        else:
+            self.registers = other_memref.registers
 
 
 class GlobalMemRef:
diff --git a/openhls_translate/EmitHLSPy.cpp b/openhls_translate/EmitHLSPy.cpp
index cd2e459..4d4dcf1 100644
--- a/openhls_translate/EmitHLSPy.cpp
+++ b/openhls_translate/EmitHLSPy.cpp
@@ -233,6 +233,7 @@ class ModuleEmitter : public OpenHLSEmitterBase {
   void emitLoad(memref::LoadOp op);
   void emitStore(memref::StoreOp op);
   void emitMemCpy(memref::CopyOp op);
+  void emitMemSubview(memref::SubViewOp op);
   void emitGlobal(memref::GlobalOp op);
   void emitGetGlobal(memref::GetGlobalOp op);
   void emitTensorStore(memref::TensorStoreOp op);
@@ -420,6 +421,7 @@ class StmtVisitor : public HLSVisitorBase<StmtVisitor, bool> {
   bool visitOp(memref::StoreOp op) { return emitter.emitStore(op), true; }
   bool visitOp(memref::DeallocOp op) { return true; }
   bool visitOp(memref::CopyOp op) { return emitter.emitMemCpy(op), true; }
+  bool visitOp(memref::SubViewOp op) { return emitter.emitMemSubview(op), true; }
   bool visitOp(memref::GlobalOp op) { return emitter.emitGlobal(op), true; }
   bool visitOp(memref::GetGlobalOp op) {
     return emitter.emitGetGlobal(op), true;
@@ -1169,33 +1171,40 @@ void ModuleEmitter::emitStore(memref::StoreOp op) {
 }
 
 void ModuleEmitter::emitMemCpy(memref::CopyOp op) {
-//  indent() << "memcpy(";
   indent() << "";
-//  emitValue(op.target());
-//  os << " = ";
   emitValue(op.target());
   os << ".alias(";
   emitValue(op.getSource());
   os << ")";
-//  os << ", ";
+  os << "\n";
+}
 
-//  auto type = op.target().getType().cast<MemRefType>();
-//  os << type.getNumElements() << " * sizeof(" << getTypeName(op.target())
-//     << "))";
-//  os << "\n";
+void ModuleEmitter::emitMemSubview(memref::SubViewOp op) {
+  indent() << "";
+  emitArrayDecl(op.getResult());
+  os << "\n";
+  indent() << "";
+  emitValue(op.result());
+  os << ".alias(";
+  emitValue(op.getSource());
+  os << ", offsets=" << op.getStaticOffsets();
+  os << ", sizes=" << op.getStaticSizes();
+  os << ", strides=" << op.getStaticStrides();
+  os << ")";
   os << "\n";
 }
 
 void ModuleEmitter::emitGlobal(memref::GlobalOp op) {
   auto initial_val = op.initial_value();
   auto elem = initial_val->dyn_cast<DenseFPElementsAttr>();
-  os << op.sym_name().str() << " = np.array([";
-  for (const auto &item : elem.getValues<FloatAttr>())
-    os << item.getValueAsDouble() << ", ";
-  os << "]).reshape(";
-
+  os << op.sym_name().str() << " = np.full((";
   for (const auto &item : elem.getType().getShape())
     os << item << ", ";
+  os << "), ";
+  for (const auto &item : elem.getValues<FloatAttr>()) {
+    os << item.getValueAsDouble();
+    break;
+  }
   os << ")\n";
 }
 
diff --git a/openhls_translate/Visitor.h b/openhls_translate/Visitor.h
index 05a2846..706d91f 100644
--- a/openhls_translate/Visitor.h
+++ b/openhls_translate/Visitor.h
@@ -40,7 +40,7 @@ class HLSVisitorBase {
             // Memref-related statements.
             memref::AllocOp, memref::AllocaOp, memref::LoadOp, memref::StoreOp,
             memref::GlobalOp, memref::GetGlobalOp,
-            memref::DeallocOp, memref::CopyOp, memref::TensorStoreOp,
+            memref::DeallocOp, memref::CopyOp, memref::SubViewOp, memref::TensorStoreOp,
             tensor::ReshapeOp, memref::ReshapeOp, memref::CollapseShapeOp,
             memref::ExpandShapeOp, memref::ReinterpretCastOp,
             bufferization::ToMemrefOp, bufferization::ToTensorOp,
@@ -132,6 +132,7 @@ class HLSVisitorBase {
   HANDLE(memref::GetGlobalOp);
   HANDLE(memref::DeallocOp);
   HANDLE(memref::CopyOp);
+  HANDLE(memref::SubViewOp);
   HANDLE(memref::TensorStoreOp);
   HANDLE(tensor::ReshapeOp);
   HANDLE(memref::ReshapeOp);
diff --git a/pyproject.toml b/pyproject.toml
index 82bb229..07aa8f9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 requires = [
     "setuptools>=42",
     "wheel",
-    "cmake==3.21",
+    "cmake>=3.24",
     # MLIR build depends.
     "ninja",
     "numpy==1.23.1",
diff --git a/requirements.txt b/requirements.txt
index 5d73e98..911b737 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,6 @@ numpy
 networkx
 astor
 jinja2
-cocotb==1.6.2
+cocotb
 matplotlib
 xeda
diff --git a/scripts/build.sh b/scripts/build.sh
index a77f10d..8ebbfc0 100755
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -32,6 +32,7 @@ if [ ! -f "${OPENHLS_DIR}"/build/llvm/CMakeCache.txt ]; then
     -DCMAKE_BUILD_TYPE=DEBUG \
     -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
     -DLLVM_TARGETS_TO_BUILD=host \
+    -DPython3_FIND_VIRTUALENV=ONLY \
     -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
     -S "${OPENHLS_DIR}"/externals/llvm-project/llvm \
     -B "${OPENHLS_DIR}"/build/llvm
@@ -137,7 +138,7 @@ if [ ! -f "${OPENHLS_DIR}"/build/flopoco_converter/CMakeCache.txt ]; then
       -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
       -DLLVM_TARGETS_TO_BUILD=host \
       -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-      -S "${OPENHLS_DIR}"/flopoco_convert_ext \
+      -S "${OPENHLS_DIR}"/extensions/flopoco_convert_ext \
       -B "${OPENHLS_DIR}"/build/flopoco_converter
 fi
 
@@ -154,15 +155,4 @@ if [ ! -f "${OPENHLS_DIR}"/build/ghdl/bin/ghdl ]; then
     mkdir -p "${OPENHLS_DIR}"/build/ghdl
     tar -xvf ghdl-gha-ubuntu-20.04-llvm.tgz -C "${OPENHLS_DIR}"/build/ghdl
   fi
-fi
-
-
-# TODO
-#PYBIND11_DIR=${PREFIX}/lib/python3.10/site-packages/pybind11/share/cmake/
-#PYBIND11_DIR=$(python -c "import pybind11; print(pybind11.get_cmake_dir())")
-#-DPYTHON_LIBRARY="/Users/mlevental/miniforge3/envs/openhls/lib/libpython3.10.dylib" -DPYTHON_INCLUDE_DIR="/Users/mlevental/miniforge3/envs/openhls/include/python3.10" \
-
-#      -DPYTHON_INCLUDE_DIR="$(python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())")"  \
-#      -DPYTHON_LIBRARY="$(python -c "import distutils.sysconfig as sysconfig; print(sysconfig.get_config_var('LIBDIR'))")" \
-
-#-Dpybind11_DIR=/home/mlevental/miniconda3/envs/openhls/lib/python3.10/site-packages/pybind11/share/cmake/pybind11 -DPython_EXECUTABLE=/home/mlevental/miniconda3/envs/openhls/bin/python
\ No newline at end of file
+fi
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 60b4f3b..323566e 100644
--- a/setup.py
+++ b/setup.py
@@ -129,14 +129,6 @@ def build_torch_mlir(base_cmake_args):
     )
 
 
-def install_torch_mlir_from_wheel():
-    torch_mlir_wheel = get_latest_torch_mlir()
-    subprocess.check_call(
-        [sys.executable, "-m", "pip", "install", torch_mlir_wheel],
-        cwd=CWD,
-    )
-
-
 def build_circt(base_cmake_args):
     circt_dir = os.path.join(EXTERNALS, "circt")
     circt_build_dir = os.path.join(ROOT_BUILD_DIR, "circt")
@@ -168,6 +160,7 @@ def build_openhls_translate(base_cmake_args):
         f'-DMLIR_DIR={os.path.join(LLVM_BUILD_DIR, "lib", "cmake", "mlir")}',
         f'-DLLVM_DIR={os.path.join(LLVM_BUILD_DIR, "lib", "cmake", "llvm")}',
         "-DMLIR_ENABLE_BINDINGS_PYTHON=ON",
+        "-DLLVM_ENABLE_ABI_BREAKING_CHECKS=OFF",
         f"-Dpybind11_DIR={pybind11.get_cmake_dir()}",
     ]
     run_cmake(openhls_dir, cmake_args, openhls_build_dir, target="openhls_translate")