From 3888650dccadfa1d0c6749dc777c05f2ba27961f Mon Sep 17 00:00:00 2001
From: "Garth N. Wells" <gnw20@cam.ac.uk>
Date: Sat, 11 Jan 2025 14:42:06 +0000
Subject: [PATCH 1/3] Use block preconditioner for mixed Poisson problem 
 (#3580)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Switch to iterative solver for mixed-Poisson demo

* Update for complex types

* Update

* Work on block preconditioned mixed Poisson solver

* Text updates

* Small text fixes

* Disable fast fail

* Skip Hypre solver in complex mode

* Revert CI change

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update python/demo/demo_mixed-poisson.py

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>

* Update output

* Updates

* Updates

* Support zero form

* Simplify

* Install blas in CI

* Install lapack

* Fix typo

* Bump macos-15

* Use superlu_dist

* LU solver work-arounds

* Ruff fix

* Fix typo

* Fix typos

---------

Co-authored-by: Jørgen Schartum Dokken <dokken92@gmail.com>
---
 .github/workflows/ccpp.yml                |   6 +-
 .github/workflows/macos.yml               |   2 +-
 cpp/dolfinx/fem/assemble_vector_impl.h    |   2 +
 cpp/dolfinx/fem/discreteoperators.h       |  18 +-
 python/demo/demo_half_loaded_waveguide.py |   2 +-
 python/demo/demo_mixed-poisson.py         | 387 ++++++++++++++++------
 python/demo/demo_poisson.py               |   4 +-
 python/demo/demo_stokes.py                |  12 +-
 python/dolfinx/fem/dofmap.py              |  11 +-
 python/dolfinx/fem/forms.py               |  80 +++--
 python/dolfinx/fem/function.py            |  17 +-
 python/dolfinx/fem/petsc.py               |  70 ++--
 python/dolfinx/graph.py                   |   3 +
 python/dolfinx/mesh.py                    |   2 +-
 python/dolfinx/plot.py                    |   3 +-
 python/test/conftest.py                   |   2 +-
 python/test/unit/fem/test_assembler.py    |  20 +-
 17 files changed, 432 insertions(+), 209 deletions(-)
diff --git a/.github/workflows/ccpp.yml b/.github/workflows/ccpp.yml
index 28ae5bdd989..9cd218d440f 100644
--- a/.github/workflows/ccpp.yml
+++ b/.github/workflows/ccpp.yml
@@ -78,7 +78,7 @@ jobs:
           find . -type f \( -name "*.cmake" -o -name "*.cmake.in" -o -name "CMakeLists.txt" \) | xargs cmake-format --check
 
   build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-24.04
     needs: [fenicsx-refs, lint]
     env:
       OMPI_ALLOW_RUN_AS_ROOT: 1
@@ -93,7 +93,9 @@ jobs:
       - name: Install dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install catch2 cmake g++ libboost-dev libhdf5-mpi-dev libparmetis-dev libpugixml-dev libspdlog-dev mpi-default-dev ninja-build pkg-config
+          sudo apt-get install catch2 cmake g++ libblas-dev libboost-dev libhdf5-mpi-dev \
+             liblapack-dev libparmetis-dev libpugixml-dev libspdlog-dev mpi-default-dev \
+             ninja-build pkg-config
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 02dd6c144be..f7dff7556c2 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -20,7 +20,7 @@ jobs:
 
   mac-os-build:
     name: macOS Homebrew install and test
-    runs-on: macos-14
+    runs-on: macos-15
     needs: fenicsx-refs
     env:
       PETSC_ARCH: arch-darwin-c-opt
diff --git a/cpp/dolfinx/fem/assemble_vector_impl.h b/cpp/dolfinx/fem/assemble_vector_impl.h
index 6d9879c39fc..9965abc8917 100644
--- a/cpp/dolfinx/fem/assemble_vector_impl.h
+++ b/cpp/dolfinx/fem/assemble_vector_impl.h
@@ -1295,8 +1295,10 @@ void assemble_vector(
   std::shared_ptr<const mesh::Mesh<U>> mesh = L.mesh();
   assert(mesh);
   if constexpr (std::is_same_v<U, scalar_value_type_t<T>>)
+  {
     assemble_vector(b, L, mesh->geometry().dofmap(), mesh->geometry().x(),
                     constants, coefficients);
+  }
   else
   {
     auto x = mesh->geometry().x();
diff --git a/cpp/dolfinx/fem/discreteoperators.h b/cpp/dolfinx/fem/discreteoperators.h
index cfe37525a9f..87740f60d27 100644
--- a/cpp/dolfinx/fem/discreteoperators.h
+++ b/cpp/dolfinx/fem/discreteoperators.h
@@ -28,12 +28,12 @@ namespace dolfinx::fem
 /// a Lagrange finite element function in \f$V_0 \subset H^1\f$ into a
 /// Nédélec (first kind) space \f$V_1 \subset H({\rm curl})\f$, i.e.
 /// \f$\nabla V_0 \rightarrow V_1\f$. If \f$u_0\f$ is the
-/// degree-of-freedom vector associated with \f$V_0\f$, the hen
+/// degree-of-freedom vector associated with \f$V_0\f$, then
 /// \f$u_1=Au_0\f$ where \f$u_1\f$ is the degrees-of-freedom vector for
 /// interpolating function in the \f$H({\rm curl})\f$ space. An example
 /// of where discrete gradient operators are used is the creation of
-/// algebraic multigrid solvers for \f$H({\rm curl})\f$  and
-/// \f$H({\rm div})\f$ problems.
+/// algebraic multigrid solvers for \f$H({\rm curl})\f$ and \f$H({\rm
+/// div})\f$ problems.
 ///
 /// @note The sparsity pattern for a discrete operator can be
 /// initialised using sparsitybuild::cells. The space `V1` should be
@@ -44,9 +44,9 @@ namespace dolfinx::fem
 ///
 /// @param[in] topology Mesh topology
 /// @param[in] V0 Lagrange element and dofmap for corresponding space to
-/// interpolate the gradient from
-/// @param[in] V1 Nédélec (first kind) element and and dofmap for
-/// corresponding space to interpolate into
+/// interpolate the gradient from.
+/// @param[in] V1 Nédélec (first kind) element and dofmap for
+/// corresponding space to interpolate into.
 /// @param[in] mat_set A functor that sets values in a matrix
 template <dolfinx::scalar T,
           std::floating_point U = dolfinx::scalar_value_type_t<T>>
@@ -148,9 +148,9 @@ void discrete_gradient(mesh::Topology& topology,
 /// initialised using sparsitybuild::cells. The space `V1` should be
 /// used for the rows of the sparsity pattern, `V0` for the columns.
 ///
-/// @param[in] V0 The space to interpolate from
-/// @param[in] V1 The space to interpolate to
-/// @param[in] mat_set A functor that sets values in a matrix
+/// @param[in] V0 Space to interpolate from.
+/// @param[in] V1 Space to interpolate to.
+/// @param[in] mat_set Functor that sets values in a matrix.
 template <dolfinx::scalar T, std::floating_point U>
 void interpolation_matrix(const FunctionSpace<U>& V0,
                           const FunctionSpace<U>& V1, auto&& mat_set)
diff --git a/python/demo/demo_half_loaded_waveguide.py b/python/demo/demo_half_loaded_waveguide.py
index b732d6ba23e..e1446276783 100644
--- a/python/demo/demo_half_loaded_waveguide.py
+++ b/python/demo/demo_half_loaded_waveguide.py
@@ -461,7 +461,7 @@ def Omega_v(x):
         # Verify if kz is consistent with the analytical equations
         assert verify_mode(kz, w, h, d, lmbd0, eps_d, eps_v, threshold=1e-4)
 
-        print(f"eigenvalue: {-kz**2}")
+        print(f"eigenvalue: {-(kz**2)}")
         print(f"kz: {kz}")
         print(f"kz/k0: {kz / k0}")
 
diff --git a/python/demo/demo_mixed-poisson.py b/python/demo/demo_mixed-poisson.py
index 751e5be96d1..51086346b19 100644
--- a/python/demo/demo_mixed-poisson.py
+++ b/python/demo/demo_mixed-poisson.py
@@ -8,18 +8,23 @@
 #       jupytext_version: 1.14.1
 # ---
 
-# # Mixed formulation for the Poisson equation
-
-# This demo illustrates how to solve Poisson equation using a mixed
-# (two-field) formulation. In particular, it illustrates how to
+# # Mixed formulation of the Poisson equation with a block-preconditioner/solver
+#
+# This demo illustrates how to solve the Poisson equation using a mixed
+# (two-field) formulation and a block-preconditioned iterative solver.
+# In particular, it illustrates how to
 #
 # * Use mixed and non-continuous finite element spaces.
-# * Set essential boundary conditions for subspaces and $H(\mathrm{div})$ spaces.
+# * Set essential boundary conditions for subspaces and
+#   $H(\mathrm{div})$ spaces.
+# * Construct a blocked linear system.
+# * Construct a block-preconditioned iterative linear solver using
+#   PETSc/petsc4y.
+# * Construct a Hypre Auxiliary Maxwell Space (AMS) preconditioner for
+#   $H(\mathrm{div})$ problems in two-dimensions.
 #
-
 # ```{admonition} Download sources
 # :class: download
-#
 # * {download}`Python script <./demo_mixed-poisson.py>`
 # * {download}`Jupyter notebook <./demo_mixed-poisson.ipynb>`
 # ```
@@ -44,15 +49,14 @@
 #   \sigma \cdot n = g \quad {\rm on} \ \Gamma_{N}.
 # $$
 #
-# The same equations arise in connection with flow in porous media, and are
-# also referred to as Darcy flow. Here $n$ denotes the outward pointing normal
-# vector on the boundary. Looking at the variational form, we see that the
-# boundary condition for the flux ($\sigma \cdot n = g$) is now an essential
-# boundary condition (which should be enforced in the function space), while
-# the other boundary condition ($u = u_0$) is a natural boundary condition
-# (which should be applied to the variational form). Inserting the boundary
-# conditions, this variational problem can be phrased in the general form: find
-# $(\sigma, u) \in \Sigma_g \times V$ such that
+# where $n$ is the outward unit normal vector on the boundary. Looking
+# at the variational form, we see that the boundary condition for the
+# flux ($\sigma \cdot n = g$) is now an essential boundary condition
+# (which should be enforced in the function space), while the other
+# boundary condition ($u = u_0$) is a natural boundary condition (which
+# should be applied to the variational form). Inserting the boundary
+# conditions, this variational problem can be phrased in the general
+# form: find $(\sigma, u) \in \Sigma_g \times V$ such that
 #
 # $$
 #    a((\sigma, u), (\tau, v)) = L((\tau, v))
@@ -62,37 +66,33 @@
 # where the variational forms $a$ and $L$ are defined as
 #
 # $$
-# \begin{align}
-#   a((\sigma, u), (\tau, v)) &=
+#   a((\sigma, u), (\tau, v)) &:=
 #     \int_{\Omega} \sigma \cdot \tau + \nabla \cdot \tau \ u
 #   + \nabla \cdot \sigma \ v \ {\rm d} x, \\
-#   L((\tau, v)) &= - \int_{\Omega} f v \ {\rm d} x
+#   L((\tau, v)) &:= - \int_{\Omega} f v \ {\rm d} x
 #   + \int_{\Gamma_D} u_0 \tau \cdot n  \ {\rm d} s,
-# \end{align}
 # $$
-# and $\Sigma_g = \{ \tau \in H({\rm div})$ such that $\tau \cdot n|_{\Gamma_N}
-# = g \}$ and $V = L^2(\Omega)$.
-#
-# To discretize the above formulation, two discrete function spaces $\Sigma_h
-# \subset \Sigma$ and $V_h \subset V$ are needed to form a mixed function space
-# $\Sigma_h \times V_h$. A stable choice of finite element spaces is to let
-# $\Sigma_h$ be the Brezzi-Douglas-Marini elements of polynomial order
-# $k$ and let $V_h$ be discontinuous elements of polynomial order $k-1$.
+# and $\Sigma_g := \{ \tau \in H({\rm div})$ such that $\tau \cdot
+# n|_{\Gamma_N} = g \}$ and $V := L^2(\Omega)$.
 #
-# We will use the same definitions of functions and boundaries as in the
-# demo for {doc}`the Poisson equation <demo_poisson>`. These are:
+# To discretize the above formulation, two discrete function spaces
+# $\Sigma_h \subset \Sigma$ and $V_h \subset V$ are needed to form a
+# mixed function space $\Sigma_h \times V_h$. A stable choice of finite
+# element spaces is to let $\Sigma_h$ be the Raviart-Thomas elements of
+# polynomial order $k$ and let $V_h$ be discontinuous Lagrange elements of
+# polynomial order $k-1$.
 #
-# * $\Omega = [0,1] \times [0,1]$ (a unit square)
-# * $\Gamma_{D} = \{(0, y) \cup (1, y) \in \partial \Omega\}$
-# * $\Gamma_{N} = \{(x, 0) \cup (x, 1) \in \partial \Omega\}$
-# * $u_0 = 0$
-# * $g = \sin(5x)$   (flux)
-# * $f = 10\exp(-((x - 0.5)^2 + (y - 0.5)^2) / 0.02)$  (source term)
+# To solve the linear system for the mixed problem, we will use am
+# iterative method with a block-diagonal preconditioner that is based on
+# the Riesz map, see for example this
+# [paper](https://doi.org/10.1002/(SICI)1099-1506(199601/02)3:1%3C1::AID-NLA67%3E3.0.CO;2-E).
+
 #
 # ## Implementation
+#
+# Import the required modules:
 
 # +
-
 try:
     from petsc4py import PETSc
 
@@ -109,88 +109,285 @@
 
 import numpy as np
 
-from basix.ufl import element, mixed_element
-from dolfinx import default_real_type, fem, io, mesh
-from dolfinx.fem.petsc import LinearProblem
-from ufl import Measure, SpatialCoordinate, TestFunctions, TrialFunctions, div, exp, inner
+import dolfinx.fem.petsc
+from basix.ufl import element
+from dolfinx import fem, la, mesh
+from dolfinx.fem.petsc import discrete_gradient, interpolation_matrix
+from dolfinx.mesh import CellType, create_unit_square
+from ufl import (
+    Measure,
+    SpatialCoordinate,
+    TestFunction,
+    TrialFunction,
+    ZeroBaseForm,
+    div,
+    exp,
+    inner,
+)
+
+# Solution scalar (e.g., float32, complex128) and geometry (float32/64)
+# types
+dtype = dolfinx.default_scalar_type
+xdtype = dolfinx.default_real_type
+# -
 
-msh = mesh.create_unit_square(MPI.COMM_WORLD, 32, 32, mesh.CellType.quadrilateral)
+# Create a two-dimensional mesh. The iterative solver constructed
+# later requires special construction that is specific to two
+# dimensions. Application in three-dimensions would require a number of
+# changes to the linear solver.
 
+# +
+msh = create_unit_square(MPI.COMM_WORLD, 96, 96, CellType.triangle, dtype=xdtype)
+# -
+#
+# Here we construct compatible function spaces for the mixed Poisson
+# problem. The `V` Raviart-Thomas ($\mathbb{RT}$) space is a
+# vector-valued $H({\rm div})$ conforming space. The `W` space is a
+# space of discontinuous Lagrange function of degree `k`.
+# ```{note}
+# The $\mathbb{RT}_{k}$ element in DOLFINx/Basix is usually denoted as
+# $\mathbb{RT}_{k-1}$ in the literature.
+# ```
+# In the lowest-order case $k=1$. It can be increased, by the
+# convergence of the iterative solver will degrade.
+
+# +
 k = 1
-Q_el = element("BDMCF", msh.basix_cell(), k, dtype=default_real_type)
-P_el = element("DG", msh.basix_cell(), k - 1, dtype=default_real_type)
-V_el = mixed_element([Q_el, P_el])
-V = fem.functionspace(msh, V_el)
+V = fem.functionspace(msh, element("RT", msh.basix_cell(), k, dtype=xdtype))
+W = fem.functionspace(msh, element("Discontinuous Lagrange", msh.basix_cell(), k - 1, dtype=xdtype))
+# -
+
+# Trial functions for $\sigma$ and $u$ are declared on the space $V$ and
+# $W$, with corresponding test functions $\tau$ and $v$:
+
+# +
+(sigma, u) = TrialFunction(V), TrialFunction(W)
+(tau, v) = TestFunction(V), TestFunction(W)
+# -
 
-(sigma, u) = TrialFunctions(V)
-(tau, v) = TestFunctions(V)
+# The source function is set to be $f = 10\exp(-((x_{0} - 0.5)^2 +
+# (x_{1} - 0.5)^2) / 0.02)$:
 
+# +
 x = SpatialCoordinate(msh)
-f = 10.0 * exp(-((x[0] - 0.5) * (x[0] - 0.5) + (x[1] - 0.5) * (x[1] - 0.5)) / 0.02)
+f = 10 * exp(-((x[0] - 0.5) * (x[0] - 0.5) + (x[1] - 0.5) * (x[1] - 0.5)) / 0.02)
+# -
+
+# We now declare the blocked bilinear and linear forms. The rows of `a`
+# and `L` correspond to the $\tau$ and $v$ test functions, respectively.
+# The columns of `a` correspond to the $\sigma$ and $u$ trial functions,
+# respectively. Note that `a[1][1]` is empty, which is denoted by
+# `None`. This zero block is typical of a saddle-point problem. In the
+# `L[0]` block, the test function $\tau$ is multiplied by a zero
+# `Constant`, which is evaluated at runtime. We do this to preserve
+# knowledge of the test space in the block. *Note that the defined `L`
+# corresponds to $u_{0} = 0$ on $\Gamma_{D}$.*
 
+# +
 dx = Measure("dx", msh)
-a = inner(sigma, tau) * dx + inner(u, div(tau)) * dx + inner(div(sigma), v) * dx
-L = -inner(f, v) * dx
+a = [[inner(sigma, tau) * dx, inner(u, div(tau)) * dx], [inner(div(sigma), v) * dx, None]]
+L = [ZeroBaseForm((tau,)), -inner(f, v) * dx]
+# -
+
+# We now compile the abstract/symbolic forms in `a` and `L` into
+# concrete instanced that can be assembled into matrix operators and
+# vectors, respectively.
+
+# +
+a, L = fem.form(a, dtype=dtype), fem.form(L, dtype=dtype)
+# -
 
-# Get subspace of V
-V0 = V.sub(0)
+# In preparation for Dirichlet boundary conditions, we use the function
+# `locate_entities_boundary` to locate mesh entities (facets) with which
+# degree-of-freedoms to be constrained are associated with, and then use
+# `locate_dofs_topological` to get the  degree-of-freedom indices. Below
+# we identify the degree-of-freedom in `V` on the (i) top ($x_{1} = 1$)
+# and (ii) bottom ($x_{1} = 0$) of the mesh/domain.
 
+# +
 fdim = msh.topology.dim - 1
 facets_top = mesh.locate_entities_boundary(msh, fdim, lambda x: np.isclose(x[1], 1.0))
-Q, _ = V0.collapse()
-dofs_top = fem.locate_dofs_topological((V0, Q), fdim, facets_top)
+facets_bottom = mesh.locate_entities_boundary(msh, fdim, lambda x: np.isclose(x[1], 0.0))
+dofs_top = fem.locate_dofs_topological(V, fdim, facets_top)
+dofs_bottom = fem.locate_dofs_topological(V, fdim, facets_bottom)
+# -
 
+# Now, we create Dirichlet boundary objects for the condition $\sigma
+# \cdot n = \sin(5 x_(0)$ on the top and bottom boundaries:
 
-def f1(x):
-    values = np.zeros((2, x.shape[1]))
-    values[1, :] = np.sin(5 * x[0])
-    return values
+# +
+cells_top_ = mesh.compute_incident_entities(msh.topology, facets_top, fdim, fdim + 1)
+cells_bottom = mesh.compute_incident_entities(msh.topology, facets_bottom, fdim, fdim + 1)
+g = fem.Function(V, dtype=dtype)
+g.interpolate(lambda x: np.vstack((np.zeros_like(x[0]), np.sin(5 * x[0]))), cells0=cells_top_)
+g.interpolate(lambda x: np.vstack((np.zeros_like(x[0]), -np.sin(5 * x[0]))), cells0=cells_bottom)
+bcs = [fem.dirichletbc(g, dofs_top), fem.dirichletbc(g, dofs_bottom)]
+# -
+
+# Assemble the matrix operator `A` into a PETSc 'nested matrix', zero
+# rows and columns associated with a Dirichlet boundary condition and
+# placing 1 on the diagonal for Dirichlet constrained
+# degrees-of-freedom:
 
+# +
+A = fem.petsc.assemble_matrix_nest(a, bcs=bcs)
+A.assemble()
+# -
 
-f_h1 = fem.Function(Q)
-f_h1.interpolate(f1)
-bc_top = fem.dirichletbc(f_h1, dofs_top, V0)
+# Assemble the RHS vector as a 'nested' vector and modify (apply
+# lifting) to account for the effect non-zero Dirichlet boundary
+# conditions. Then set Dirichlet boundary values in the RHS vector `b`:
 
+# +
+b = fem.petsc.assemble_vector_nest(L)
+fem.petsc.apply_lifting_nest(b, a, bcs=bcs)
+for b_sub in b.getNestSubVecs():
+    b_sub.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)
+
+bcs0 = fem.bcs_by_block(fem.extract_function_spaces(L), bcs)
+fem.petsc.set_bc_nest(b, bcs0)
+# -
+
+# Rather than solving the linear system $A x = b$, we will solve the
+# preconditioned problem $P^{-1} A x = P^{-1} b$. Commonly $P = A$, but
+# this does not lead to efficient solvers for saddle point problems.
+#
+# For this problem, we introduce the preconditioner
+# $$
+# a_p((\sigma, u), (\tau, v))
+# = \begin{bmatrix} \int_{\Omega} \sigma \cdot \tau + (\nabla \cdot
+# \sigma) (\nabla \cdot \tau) \ {\rm d} x  & 0 \\ 0 &
+# \int_{\Omega} u \cdot v \ {\rm d} x \end{bmatrix}
+# $$
+# and assemble it into the matrix `P`:
 
-facets_bottom = mesh.locate_entities_boundary(msh, fdim, lambda x: np.isclose(x[1], 0.0))
-dofs_bottom = fem.locate_dofs_topological((V0, Q), fdim, facets_bottom)
+# +
+a_p = fem.form(
+    [[inner(sigma, tau) * dx + inner(div(sigma), div(tau)) * dx, None], [None, inner(u, v) * dx]],
+    dtype=dtype,
+)
+P = fem.petsc.assemble_matrix_nest(a_p, bcs=bcs)
+P.assemble()
+# -
 
+# We now create a PETSc Krylov solver and set the preconditioner (`P`)
+# and operator (`A`) matrices:
 
-def f2(x):
-    values = np.zeros((2, x.shape[1]))
-    values[1, :] = -np.sin(5 * x[0])
-    return values
+# +
+ksp = PETSc.KSP().create(msh.comm)
+ksp.setOperators(A, P)
+ksp.setMonitor(lambda ksp, its, rnorm: print(f"Iteration: {its}, residual: {rnorm}"))
+ksp.setType("gmres")
+ksp.setTolerances(rtol=1e-8)
+ksp.setGMRESRestart(100)
+# -
+
+# To apply different solvers/preconditioners to the blocks of `P`, we
+# set the preconditioner to be a PETSc
+# [`fieldsplit`](https://petsc.org/release/manual/ksp/#sec-block-matrices)
+# (block) type and set the 'splits' between the $\sigma$ and $u$ fields.
 
+# +
+ksp.getPC().setType("fieldsplit")
+ksp.getPC().setFieldSplitType(PETSc.PC.CompositeType.ADDITIVE)
+nested_IS = P.getNestISs()
+ksp.getPC().setFieldSplitIS(("sigma", nested_IS[0][0]), ("u", nested_IS[0][1]))
+ksp_sigma, ksp_u = ksp.getPC().getFieldSplitSubKSP()
+# -
+
+# For the $P_{11}$ block, which is the discontinuous Lagrange mass
+# matrix, we let the preconditioner be the default, which is incomplete
+# LU factorisation and which can solve the block exactly in one
+# iteration. The $P_{00}$ requires careful handling as $H({\rm div})$
+# problems require special preconditioners to be efficient.
+#
+# If PETSc has been configured with Hypre, we use the Hypre `Auxiliary
+# Maxwell Space` (AMS) algebraic multigrid preconditioner. We can use
+# AMS for this $H({\rm div})$-type problem in two-dimensions because
+# $H({\rm div})$ and $H({\rm curl})$ spaces are effectively the same in
+# two-dimensions, just rotated by $\pi/2.
 
-f_h2 = fem.Function(Q)
-f_h2.interpolate(f2)
-bc_bottom = fem.dirichletbc(f_h2, dofs_bottom, V0)
+# +
+pc_sigma = ksp_sigma.getPC()
+if PETSc.Sys().hasExternalPackage("hypre") and not np.issubdtype(dtype, np.complexfloating):
+    pc_sigma.setType("hypre")
+    pc_sigma.setHYPREType("ams")
+
+    opts = PETSc.Options()
+    opts[f"{ksp_sigma.prefix}pc_hypre_ams_cycle_type"] = 7
+    opts[f"{ksp_sigma.prefix}pc_hypre_ams_relax_times"] = 2
+
+    # Construct and set the 'discrete gradient' operator, which maps
+    # grad H1 -> H(curl), i.e. the gradient of a scalar Lagrange space
+    # to a H(curl) space
+    V_H1 = fem.functionspace(msh, element("Lagrange", msh.basix_cell(), k, dtype=xdtype))
+    V_curl = fem.functionspace(msh, element("N1curl", msh.basix_cell(), k, dtype=xdtype))
+    G = discrete_gradient(V_H1, V_curl)
+    G.assemble()
+    pc_sigma.setHYPREDiscreteGradient(G)
+
+    assert k > 0, "Element degree must be at least 1."
+    if k == 1:
+        # For the lowest order base (k=1), we can supply interpolation
+        # of the '1' vectors in the space V. Hypre can then construct
+        # the required operators from G and the '1' vectors.
+        cvec0, cvec1 = fem.Function(V), fem.Function(V)
+        cvec0.interpolate(lambda x: np.vstack((np.ones_like(x[0]), np.zeros_like(x[1]))))
+        cvec1.interpolate(lambda x: np.vstack((np.zeros_like(x[0]), np.ones_like(x[1]))))
+        pc_sigma.setHYPRESetEdgeConstantVectors(cvec0.x.petsc_vec, cvec1.x.petsc_vec, None)
+    else:
+        # For high-order spaces, we must provide the (H1)^d -> H(div)
+        # interpolation operator/matrix
+        V_H1d = fem.functionspace(msh, ("Lagrange", k, (msh.geometry.dim,)))
+        Pi = interpolation_matrix(V_H1d, V)  # (H1)^d -> H(div)
+        Pi.assemble()
+        pc_sigma.setHYPRESetInterpolations(msh.geometry.dim, None, None, Pi, None)
+
+        # High-order elements generally converge less well than the
+        # lowest-order case with algebraic multigrid, so we perform
+        # extra work at the multigrid stage
+        opts[f"{ksp_sigma.prefix}pc_hypre_ams_tol"] = 1e-12
+        opts[f"{ksp_sigma.prefix}pc_hypre_ams_max_iter"] = 3
+
+    ksp_sigma.setFromOptions()
+else:
+    # If Hypre is not available, use LU factorisation on the $P_{00}$
+    # block
+    pc_sigma.setType("lu")
+    use_superlu = PETSc.IntType == np.int64
+    if PETSc.Sys().hasExternalPackage("mumps") and not use_superlu:
+        pc_sigma.setFactorSolverType("mumps")
+    elif PETSc.Sys().hasExternalPackage("superlu_dist"):
+        pc_sigma.setFactorSolverType("superlu_dist")
+# -
+
+# We create finite element functions that will hold the $\sigma$ and $u$
+# solutions:
 
+# +
+sigma, u = fem.Function(V, dtype=dtype), fem.Function(W, dtype=dtype)
+# -
 
-bcs = [bc_top, bc_bottom]
+# Create a PETSc 'nested' vector that holds reference to the `sigma` and
+# `u` solution (degree-of-freedom) vectors and solve.
 
-problem = LinearProblem(
-    a,
-    L,
-    bcs=bcs,
-    petsc_options={
-        "ksp_type": "preonly",
-        "pc_type": "lu",
-        "pc_factor_mat_solver_type": "superlu_dist",
-    },
-)
-try:
-    w_h = problem.solve()
-except PETSc.Error as e:  # type: ignore
-    if e.ierr == 92:
-        print("The required PETSc solver/preconditioner is not available. Exiting.")
-        print(e)
-        exit(0)
-    else:
-        raise e
+# +
+x = PETSc.Vec().createNest([la.create_petsc_vector_wrap(sigma.x), la.create_petsc_vector_wrap(u.x)])
+ksp.solve(b, x)
+reason = ksp.getConvergedReason()
+assert reason > 0, f"Krylov solver has not converged {reason}."
+ksp.view()
+# -
 
-sigma_h, u_h = w_h.split()
+# We save the solution `u` in VTX format:
 
-with io.XDMFFile(msh.comm, "out_mixed_poisson/u.xdmf", "w") as file:
-    file.write_mesh(msh)
-    file.write_function(u_h)
+# +
+try:
+    from dolfinx.io import VTXWriter
+
+    u.name = "u"
+    with VTXWriter(msh.comm, "output_mixed_poisson.bp", u, "bp4") as f:
+        f.write(0.0)
+except ImportError:
+    print("ADIOS2 required for VTX output.")
+# -
diff --git a/python/demo/demo_poisson.py b/python/demo/demo_poisson.py
index 78f3aec3982..38c072211e3 100644
--- a/python/demo/demo_poisson.py
+++ b/python/demo/demo_poisson.py
@@ -190,6 +190,6 @@
     else:
         plotter.show()
 except ModuleNotFoundError:
-    print("'pyvista' is required to visualise the solution")
-    print("Install 'pyvista' with pip: 'python3 -m pip install pyvista'")
+    print("'pyvista' is required to visualise the solution.")
+    print("To install pyvista with pip: 'python3 -m pip install pyvista'.")
 # -
diff --git a/python/demo/demo_stokes.py b/python/demo/demo_stokes.py
index bf89e14fc4f..599063f4fc5 100644
--- a/python/demo/demo_stokes.py
+++ b/python/demo/demo_stokes.py
@@ -116,7 +116,7 @@
 from dolfinx.fem.petsc import assemble_matrix_block, assemble_vector_block
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import CellType, create_rectangle, locate_entities_boundary
-from ufl import div, dx, grad, inner
+from ufl import ZeroBaseForm, div, dx, grad, inner
 
 # We create a {py:class}`Mesh <dolfinx.mesh.Mesh>`, define functions for
 # locating geometrically subsets of the boundary, and define a function
@@ -177,7 +177,7 @@ def lid_velocity_expression(x):
 # -
 
 # The bilinear and linear forms for the Stokes equations are defined
-# using a a blocked structure:
+# using a blocked structure:
 
 # +
 # Define variational problem
@@ -186,7 +186,7 @@ def lid_velocity_expression(x):
 f = Constant(msh, (PETSc.ScalarType(0), PETSc.ScalarType(0)))  # type: ignore
 
 a = form([[inner(grad(u), grad(v)) * dx, inner(p, div(v)) * dx], [inner(div(u), q) * dx, None]])
-L = form([inner(f, v) * dx, inner(Constant(msh, PETSc.ScalarType(0)), q) * dx])  # type: ignore
+L = form([inner(f, v) * dx, ZeroBaseForm((q,))])
 # -
 
 # A block-diagonal preconditioner will be used with the iterative
@@ -440,9 +440,8 @@ def block_direct_solver():
     # handle pressure nullspace
     pc = ksp.getPC()
     pc.setType("lu")
-    sys = PETSc.Sys()  # type: ignore
     use_superlu = PETSc.IntType == np.int64
-    if sys.hasExternalPackage("mumps") and not use_superlu:
+    if PETSc.Sys().hasExternalPackage("mumps") and not use_superlu:
         pc.setFactorSolverType("mumps")
         pc.setFactorSetUpSolverType()
         pc.getFactorMatrix().setMumpsIcntl(icntl=24, ival=1)
@@ -527,9 +526,8 @@ def mixed_direct():
     # Configure MUMPS to handle pressure nullspace
     pc = ksp.getPC()
     pc.setType("lu")
-    sys = PETSc.Sys()  # type: ignore
     use_superlu = PETSc.IntType == np.int64
-    if sys.hasExternalPackage("mumps") and not use_superlu:
+    if PETSc.Sys().hasExternalPackage("mumps") and not use_superlu:
         pc.setFactorSolverType("mumps")
         pc.setFactorSetUpSolverType()
         pc.getFactorMatrix().setMumpsIcntl(icntl=24, ival=1)
diff --git a/python/dolfinx/fem/dofmap.py b/python/dolfinx/fem/dofmap.py
index 0355a2cd860..b8a848170dc 100644
--- a/python/dolfinx/fem/dofmap.py
+++ b/python/dolfinx/fem/dofmap.py
@@ -8,10 +8,10 @@
 
 
 class DofMap:
-    """Degree-of-freedom map
+    """Degree-of-freedom map.
 
-    This class handles the mapping of degrees of freedom. It builds
-    a dof map based on a FiniteElement on a specific mesh.
+    This class handles the mapping of degrees of freedom. It builds a
+    dof map based on a FiniteElement on a specific mesh.
     """
 
     _cpp_object: _cpp.fem.DofMap
@@ -26,13 +26,14 @@ def cell_dofs(self, cell_index: int):
             cell: The cell index.
 
         Returns:
-            Local-global dof map for the cell (using process-local indices).
+            Local-global dof map for the cell (using process-local
+            indices).
         """
         return self._cpp_object.cell_dofs(cell_index)
 
     @property
     def bs(self):
-        """Returns the block size of the dofmap"""
+        """Block size of the dofmap."""
         return self._cpp_object.bs
 
     @property
diff --git a/python/dolfinx/fem/forms.py b/python/dolfinx/fem/forms.py
index bb96eae9261..85ed796d011 100644
--- a/python/dolfinx/fem/forms.py
+++ b/python/dolfinx/fem/forms.py
@@ -1,4 +1,5 @@
-# Copyright (C) 2017-2024 Chris N. Richardson, Garth N. Wells, Michal Habera and Jørgen S. Dokken
+# Copyright (C) 2017-2024 Chris N. Richardson, Garth N. Wells,
+# Michal Habera and Jørgen S. Dokken
 #
 # This file is part of DOLFINx (https://www.fenicsproject.org)
 #
@@ -116,17 +117,17 @@ def get_integration_domains(
 ) -> list[tuple[int, np.ndarray]]:
     """Get integration domains from subdomain data.
 
-    The subdomain data is a meshtags object consisting of markers, or a
-    None object. If it is a None object we do not pack any integration
+    The subdomain data is a MeshTags object consisting of markers, or
+    ``None``. If it is ``None``, we do not pack any integration
     entities. Integration domains are defined as a list of tuples, where
-    each input `subdomain_ids` is mapped to an array of integration
+    each input ``subdomain_ids`` is mapped to an array of integration
     entities, where an integration entity for a cell integral is the
     list of cells. For an exterior facet integral each integration
-    entity is a tuple (cell_index, local_facet_index). For an interior
-    facet integral each integration entity is a uple (cell_index0,
-    local_facet_index0, cell_index1, local_facet_index1). Where the
-    first cell-facet pair is the '+' restriction, the second the '-'
-    restriction.
+    entity is a tuple ``(cell_index, local_facet_index)``. For an
+    interior facet integral each integration entity is a tuple
+    ``(cell_index0, local_facet_index0, cell_index1,
+    local_facet_index1)``. Where the first cell-facet pair is the
+    ``'+'`` restriction, the second the ``'-'`` restriction.
 
     Args:
         integral_type: The type of integral to pack integration
@@ -216,9 +217,9 @@ def form(
             For each key (a mesh, different to the integration domain
             mesh) a map should be provided relating the entities in the
             integration domain mesh to the entities in the key mesh e.g.
-            for a key-value pair (msh, emap) in `entity_maps`, `emap[i]`
-            is the entity in `msh` corresponding to entity `i` in the
-            integration domain mesh.
+            for a key-value pair ``(msh, emap)`` in ``entity_maps``,
+            ``emap[i]`` is the entity in ``msh`` corresponding to entity
+            ``i`` in the integration domain mesh.
 
     Returns:
         Compiled finite element Form.
@@ -246,11 +247,11 @@ def _form(form):
         for data in sd.get(domain).values():
             assert all([d is data[0] for d in data if d is not None])
 
-        mesh = domain.ufl_cargo()
-        if mesh is None:
+        msh = domain.ufl_cargo()
+        if msh is None:
             raise RuntimeError("Expecting to find a Mesh in the form.")
         ufcx_form, module, code = jit.ffcx_jit(
-            mesh.comm, form, form_compiler_options=form_compiler_options, jit_options=jit_options
+            msh.comm, form, form_compiler_options=form_compiler_options, jit_options=jit_options
         )
 
         # For each argument in form extract its function space
@@ -308,10 +309,28 @@ def _form(form):
             constants,
             subdomains,
             _entity_maps,
-            mesh,
+            msh,
         )
         return Form(f, ufcx_form, code, module)
 
+    def _zero_form(form):
+        """Compile a single 'zero' UFL form, i.e. a form with no integrals."""
+        V = [arg.ufl_function_space()._cpp_object for arg in form.arguments()]
+        if entity_maps is None:
+            _entity_maps = dict()
+        else:
+            _entity_maps = {msh._cpp_object: emap for (msh, emap) in entity_maps.items()}
+        f = ftype(
+            spaces=V,
+            integrals={},
+            coefficients=[],
+            constants=[],
+            need_permutation_data=False,
+            entity_maps=_entity_maps,
+            mesh=None,
+        )
+        return Form(f)
+
     def _create_form(form):
         """Recursively convert ufl.Forms to dolfinx.fem.Form.
 
@@ -323,10 +342,9 @@ def _create_form(form):
             A ``dolfinx.fem.Form`` or a list of ``dolfinx.fem.Form``.
         """
         if isinstance(form, ufl.Form):
-            if form.empty():
-                return None
-            else:
-                return _form(form)
+            return _form(form)
+        elif isinstance(form, ufl.ZeroBaseForm):
+            return _zero_form(form)
         elif isinstance(form, collections.abc.Iterable):
             return list(map(lambda sub_form: _create_form(sub_form), form))
         else:
@@ -344,11 +362,11 @@ def extract_function_spaces(
 ) -> typing.Iterable[typing.Union[None, function.FunctionSpace]]:
     """Extract common function spaces from an array of forms.
 
-    If `forms` is a list of linear form, this function returns of list
-    of the corresponding test functions. If `forms` is a 2D array of
-    bilinear forms, for index=0 the list common test function space for
-    each row is returned, and if index=1 the common trial function
-    spaces for each column are returned.
+    If ``forms`` is a list of linear form, this function returns of list
+    of the corresponding test functions. If ``forms`` is a 2D array of
+    bilinear forms, for ``index=0`` the list common test function space
+    for each row is returned, and if ``index=1`` the common trial
+    function spaces for each column are returned.
     """
     _forms = np.array(forms)
     if _forms.ndim == 0:
@@ -456,7 +474,7 @@ def form_cpp_creator(
 def create_form(
     form: CompiledForm,
     function_spaces: list[function.FunctionSpace],
-    mesh: Mesh,
+    msh: Mesh,
     subdomains: dict[IntegralType, list[tuple[int, np.ndarray]]],
     coefficient_map: dict[ufl.Function, function.Function],
     constant_map: dict[ufl.Constant, function.Constant],
@@ -469,7 +487,7 @@ def create_form(
         form: Compiled ufl form
         function_spaces: List of function spaces associated with the
             form. Should match the number of arguments in the form.
-        mesh: Mesh to associate form with
+        msh: Mesh to associate form with
         subdomains: A map from integral type to a list of pairs, where
             each pair corresponds to a subdomain id and the set of of
             integration entities to integrate over. Can be computed with
@@ -477,9 +495,9 @@ def create_form(
         coefficient_map: Map from UFL coefficient to function with data.
         constant_map: Map from UFL constant to constant with data.
         entity_map: A map where each key corresponds to a mesh different
-            to the integration domain `mesh`. The value of the map is an
-            array of integers, where the i-th entry is the entity in the
-            key mesh.
+            to the integration domain ``msh``. The value of the map is
+            an array of integers, where the i-th entry is the entity in
+            the key mesh.
     """
     if entity_maps is None:
         _entity_maps = {}
@@ -506,6 +524,6 @@ def create_form(
         constants,
         _subdomain_data,
         _entity_maps,
-        mesh._cpp_object,
+        msh._cpp_object,
     )
     return Form(f, form.ufcx_form, form.code)
diff --git a/python/dolfinx/fem/function.py b/python/dolfinx/fem/function.py
index 44c1f740bb2..8c8a3277ea3 100644
--- a/python/dolfinx/fem/function.py
+++ b/python/dolfinx/fem/function.py
@@ -316,9 +316,9 @@ def __init__(
             if dtype is None:
                 dtype = default_scalar_type
 
-        assert np.issubdtype(
-            V.element.dtype, np.dtype(dtype).type(0).real.dtype
-        ), "Incompatible FunctionSpace dtype and requested dtype."
+        assert np.issubdtype(V.element.dtype, np.dtype(dtype).type(0).real.dtype), (
+            "Incompatible FunctionSpace dtype and requested dtype."
+        )
 
         # Create cpp Function
         def functiontype(dtype):
@@ -592,9 +592,9 @@ def functionspace(
     element = finiteelement(mesh.topology.cell_type, ufl_e, dtype)
     cpp_dofmap = _cpp.fem.create_dofmap(mesh.comm, mesh.topology._cpp_object, element._cpp_object)
 
-    assert np.issubdtype(
-        mesh.geometry.x.dtype, element.dtype
-    ), "Mesh and element dtype are not compatible."
+    assert np.issubdtype(mesh.geometry.x.dtype, element.dtype), (
+        "Mesh and element dtype are not compatible."
+    )
 
     # Initialize the cpp.FunctionSpace
     try:
@@ -667,11 +667,6 @@ def num_sub_spaces(self) -> int:
         """Number of sub spaces."""
         return self.element.num_sub_elements
 
-    # @property
-    # def value_shape(self) -> tuple[int, ...]:
-    #     """Value shape."""
-    #     return tuple(int(i) for i in self._cpp_object.value_shape)
-
     def sub(self, i: int) -> FunctionSpace:
         """Return the i-th sub space.
 
diff --git a/python/dolfinx/fem/petsc.py b/python/dolfinx/fem/petsc.py
index 9771371c6d8..4305c004ab0 100644
--- a/python/dolfinx/fem/petsc.py
+++ b/python/dolfinx/fem/petsc.py
@@ -143,7 +143,8 @@ def create_vector_block(L: list[Form]) -> PETSc.Vec:
 
 
 def create_vector_nest(L: list[Form]) -> PETSc.Vec:
-    """Create a PETSc nested vector (``VecNest``) that is compatible with a list of linear forms.
+    """Create a PETSc nested vector (``VecNest``) that is compatible
+    with a list of linear forms.
 
     Args:
         L: List of linear forms.
@@ -285,14 +286,16 @@ def _assemble_vector_vec(b: PETSc.Vec, L: Form, constants=None, coeffs=None) ->
 def assemble_vector_nest(L: typing.Any, constants=None, coeffs=None) -> PETSc.Vec:
     """Assemble linear forms into a new nested PETSc (``VecNest``) vector.
 
-    Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Vec.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Vec.destroy()`` is collective over the object's MPI communicator.
-
     The returned vector is not finalised, i.e. ghost values are not
     accumulated on the owning processes.
+
+    Note:
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Vec.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Vec.destroy()`` is collective over the object's MPI
+        communicator.
     """
     maps = [
         (form.function_spaces[0].dofmap.index_map, form.function_spaces[0].dofmap.index_map_bs)
@@ -337,13 +340,15 @@ def assemble_vector_block(
 ) -> PETSc.Vec:
     """Assemble linear forms into a monolithic vector.
 
-    Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Vec.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Vec.destroy()`` is collective over the object's MPI communicator.
-
     The vector is not finalised, i.e. ghost values are not accumulated.
+
+    Note:
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Vec.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Vec.destroy()`` is collective over the object's MPI
+        communicator.
     """
     maps = [
         (form.function_spaces[0].dofmap.index_map, form.function_spaces[0].dofmap.index_map_bs)
@@ -460,10 +465,12 @@ def assemble_matrix(
         have not been communicated.
 
     Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator.
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Mat.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Mat.destroy()`` is collective over the object's MPI
+        communicator.
 
     Args:
         a: Bilinear form to assembled into a matrix.
@@ -520,10 +527,12 @@ def assemble_matrix_nest(
     """Create a nested matrix and assemble bilinear forms into the matrix.
 
     Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator.
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Mat.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Mat.destroy()`` is collective over the object's MPI
+        communicator.
 
     Args:
         a: Rectangular (list-of-lists) array for bilinear forms.
@@ -615,11 +624,12 @@ def assemble_matrix_block(
     """Assemble bilinear forms into a blocked matrix.
 
     Note:
-        Due to subtle issues in the interaction between petsc4py memory management
-        and the Python garbage collector, it is recommended that the method ``PETSc.Mat.destroy()``
-        is called on the returned object once the object is no longer required. Note that
-        ``PETSc.Mat.destroy()`` is collective over the object's MPI communicator.
-
+        Due to subtle issues in the interaction between petsc4py memory
+        management and the Python garbage collector, it is recommended
+        that the method ``PETSc.Mat.destroy()`` is called on the
+        returned object once the object is no longer required. Note that
+        ``PETSc.Mat.destroy()`` is collective over the object's MPI
+        communicator.
     """
     _a = [[None if form is None else form._cpp_object for form in arow] for arow in a]
     A = _cpp.fem.petsc.create_matrix_block(_a)
@@ -777,7 +787,9 @@ def set_bc_nest(
     x0: typing.Optional[PETSc.Vec] = None,
     alpha: float = 1,
 ) -> None:
-    """Apply the function :func:`dolfinx.fem.set_bc` to each sub-vector of a nested PETSc Vector."""
+    """Apply the function :func:`dolfinx.fem.set_bc` to each sub-vector
+    of a nested PETSc Vector.
+    """
     _b = b.getNestSubVecs()
     x0 = len(_b) * [None] if x0 is None else x0.getNestSubVecs()
     for b_sub, bc, x_sub in zip(_b, bcs, x0):
@@ -962,8 +974,8 @@ def __init__(
                 command line to see all available options.
             jit_options: Options used in CFFI JIT compilation of C
                 code generated by FFCx. See ``python/dolfinx/jit.py``
-                for all available options. Takes priority over all
-                other option values.
+                for all available options. Takes priority over all other
+                option values.
 
         Example::
 
diff --git a/python/dolfinx/graph.py b/python/dolfinx/graph.py
index ebc297ba000..5ad919a1fa2 100644
--- a/python/dolfinx/graph.py
+++ b/python/dolfinx/graph.py
@@ -48,6 +48,9 @@ def __init__(self, cpp_object: Union[_cpp.la.AdjacencyList_int32, _cpp.la.Adjace
         """
         self._cpp_object = cpp_object
 
+    def __repr__(self):
+        return self._cpp_object.__repr__
+
     def links(self, node: Union[np.int32, np.int64]) -> npt.NDArray[Union[np.int32, np.int64]]:
         """Retrieve the links of a node.
 
diff --git a/python/dolfinx/mesh.py b/python/dolfinx/mesh.py
index 9439faa5521..89a1ff69f21 100644
--- a/python/dolfinx/mesh.py
+++ b/python/dolfinx/mesh.py
@@ -366,7 +366,7 @@ def ufl_id(self) -> int:
 
     @property
     def topology(self) -> _cpp.mesh.Topology:
-        """Mesh topology with which the the tags are associated."""
+        """Mesh topology with which the tags are associated."""
         return self._cpp_object.topology
 
     @property
diff --git a/python/dolfinx/plot.py b/python/dolfinx/plot.py
index ef90cb1c2ca..f92499de55f 100644
--- a/python/dolfinx/plot.py
+++ b/python/dolfinx/plot.py
@@ -103,7 +103,8 @@ def _(V: fem.FunctionSpace, entities=None):
         "P",
     ]:
         raise RuntimeError(
-            "Can only create meshes from continuous or discontinuous Lagrange spaces"
+            "Can only create meshes from continuous or discontinuous Lagrange"
+            "spaces, not {V.ufl_element().family_name}."
         )
 
     degree = V.ufl_element().degree
diff --git a/python/test/conftest.py b/python/test/conftest.py
index b55c9a144d9..36a699d0868 100644
--- a/python/test/conftest.py
+++ b/python/test/conftest.py
@@ -192,7 +192,7 @@ def _global_dot(comm, v0, v1):
             p.array[:nr] = beta * p.array[:nr] + r
 
         raise RuntimeError(
-            f"Solver exceeded max iterations ({maxit}). Relative residual={rnorm/rnorm0}."
+            f"Solver exceeded max iterations ({maxit}). Relative residual={rnorm / rnorm0}."
         )
 
     return _cg
diff --git a/python/test/unit/fem/test_assembler.py b/python/test/unit/fem/test_assembler.py
index dc4b9f2efc8..7459af0b54c 100644
--- a/python/test/unit/fem/test_assembler.py
+++ b/python/test/unit/fem/test_assembler.py
@@ -333,9 +333,7 @@ def test_matrix_assembly_block(self, mode):
         # Define variational problem
         u, p, r = ufl.TrialFunction(V0), ufl.TrialFunction(V1), ufl.TrialFunction(V2)
         v, q, s = ufl.TestFunction(V0), ufl.TestFunction(V1), ufl.TestFunction(V2)
-        f = 1.0
         g = -3.0
-        zero = Function(V0)
 
         a00 = inner(u, v) * dx
         a01 = inner(p, v) * dx
@@ -347,7 +345,7 @@ def test_matrix_assembly_block(self, mode):
         a21 = inner(p, s) * dx
         a22 = inner(r, s) * dx
 
-        L0 = zero * inner(f, v) * dx
+        L0 = ufl.ZeroBaseForm((v,))
         L1 = inner(g, q) * dx
         L2 = inner(g, s) * dx
 
@@ -409,7 +407,7 @@ def monolithic():
                 + inner(u2, v0) * dx
                 + inner(u2, v1) * dx
             )
-            L = zero * inner(f, v0) * ufl.dx + inner(g, v1) * dx + inner(g, v2) * dx
+            L = ufl.ZeroBaseForm((v0,)) + inner(g, v1) * dx + inner(g, v2) * dx
             a, L = form(a), form(L)
 
             bdofsW_V1 = locate_dofs_topological(W.sub(1), mesh.topology.dim - 1, bndry_facets)
@@ -476,11 +474,10 @@ def test_assembly_solve_block(self, mode):
         v, q = ufl.TestFunction(V0), ufl.TestFunction(V1)
         f = 1.0
         g = -3.0
-        zero = Function(V0)
 
         a00 = form(inner(u, v) * dx)
-        a01 = form(zero * inner(p, v) * dx)
-        a10 = form(zero * inner(u, q) * dx)
+        a01 = None
+        a10 = None
         a11 = form(inner(p, q) * dx)
         L0 = form(inner(f, v) * dx)
         L1 = form(inner(g, q) * dx)
@@ -654,12 +651,10 @@ def boundary1(x):
         p01, p10 = None, None
         p11 = inner(p, q) * dx
 
-        # FIXME
-        # We need zero function for the 'zero' part of L
-        p_zero = Function(P1)
+        # We need a 'zero' form for the 'zero' part of L
         f = Function(P2)
         L0 = ufl.inner(f, v) * dx
-        L1 = ufl.inner(p_zero, q) * dx
+        L1 = ufl.ZeroBaseForm((q,))
 
         def nested_solve():
             """Nested solver"""
@@ -759,9 +754,8 @@ def monolithic_solve():
             p_form = p00 + p11
 
             f = Function(W.sub(0).collapse()[0])
-            p_zero = Function(W.sub(1).collapse()[0])
             L0 = inner(f, v) * dx
-            L1 = inner(p_zero, q) * dx
+            L1 = ufl.ZeroBaseForm((q,))
             L = L0 + L1
             a, p_form, L = form(a), form(p_form), form(L)
 

From 22a8ec26636e548e9fbe372d015f7c6c96ab79f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paul=20T=2E=20K=C3=BChner?=
 <56360279+schnellerhase@users.noreply.github.com>
Date: Sat, 11 Jan 2025 17:03:35 +0100
Subject: [PATCH 2/3] Fix inconsistent ufl usage in demos (#3532)

* Fix inconsitent ufl import in elasticity example

* Fix inconsistent ufl import in biharmonic example

* Fix inconsistent ufl import in cahn-hilliard example

* Fix inconsistent ufl import in hdg example

* Fix inconsistent ufl import in Helmholtz example

* Fix inconsistent ufl import in Lagrange variants example

* Fix inconsistent ufl import in poisson matrix free example

* Fix inconsistent ufl import in poisson example

* Fix inconsistent ufl import in pyamg example

* Fix inconsistent ufl import in stokes example

* Fix lagrange variants

* Add 'ufl' banned import from

* Switch to consistent 'import ufl' usage in demos

* New ruff version

* Remove comment
---
 python/demo/demo_biharmonic.py          | 17 +++--
 python/demo/demo_cahn-hilliard.py       | 15 ++--
 python/demo/demo_elasticity.py          |  9 ++-
 python/demo/demo_hdg.py                 | 15 ++--
 python/demo/demo_helmholtz.py           | 21 +++---
 python/demo/demo_lagrange_variants.py   |  5 +-
 python/demo/demo_mixed-poisson.py       | 33 ++++-----
 python/demo/demo_navier-stokes.py       | 93 +++++++++++--------------
 python/demo/demo_poisson.py             |  5 +-
 python/demo/demo_poisson_matrix_free.py |  9 ++-
 python/demo/demo_pyamg.py               | 13 ++--
 python/demo/demo_stokes.py              | 19 +++--
 python/demo/demo_tnt-elements.py        | 16 ++---
 python/demo/pyproject.toml              |  5 ++
 14 files changed, 139 insertions(+), 136 deletions(-)
 create mode 100644 python/demo/pyproject.toml

diff --git a/python/demo/demo_biharmonic.py b/python/demo/demo_biharmonic.py
index bf8f0d9c9bc..4ed85951321 100644
--- a/python/demo/demo_biharmonic.py
+++ b/python/demo/demo_biharmonic.py
@@ -131,7 +131,6 @@
 from dolfinx import fem, io, mesh, plot
 from dolfinx.fem.petsc import LinearProblem
 from dolfinx.mesh import CellType, GhostMode
-from ufl import CellDiameter, FacetNormal, avg, div, dS, dx, grad, inner, jump, pi, sin
 
 # -
 
@@ -192,8 +191,8 @@
 # sides of a facet.
 
 alpha = ScalarType(8.0)
-h = CellDiameter(msh)
-n = FacetNormal(msh)
+h = ufl.CellDiameter(msh)
+n = ufl.FacetNormal(msh)
 h_avg = (h("+") + h("-")) / 2.0
 
 # After that, we can define the variational problem consisting of the bilinear
@@ -210,15 +209,15 @@
 u = ufl.TrialFunction(V)
 v = ufl.TestFunction(V)
 x = ufl.SpatialCoordinate(msh)
-f = 4.0 * pi**4 * sin(pi * x[0]) * sin(pi * x[1])
+f = 4.0 * ufl.pi**4 * ufl.sin(ufl.pi * x[0]) * ufl.sin(ufl.pi * x[1])
 
 a = (
-    inner(div(grad(u)), div(grad(v))) * dx
-    - inner(avg(div(grad(u))), jump(grad(v), n)) * dS
-    - inner(jump(grad(u), n), avg(div(grad(v)))) * dS
-    + alpha / h_avg * inner(jump(grad(u), n), jump(grad(v), n)) * dS
+    ufl.inner(ufl.div(ufl.grad(u)), ufl.div(ufl.grad(v))) * ufl.dx
+    - ufl.inner(ufl.avg(ufl.div(ufl.grad(u))), ufl.jump(ufl.grad(v), n)) * ufl.dS
+    - ufl.inner(ufl.jump(ufl.grad(u), n), ufl.avg(ufl.div(ufl.grad(v)))) * ufl.dS
+    + alpha / h_avg * ufl.inner(ufl.jump(ufl.grad(u), n), ufl.jump(ufl.grad(v), n)) * ufl.dS
 )
-L = inner(f, v) * dx
+L = ufl.inner(f, v) * ufl.dx
 # -
 
 # We create a {py:class}`LinearProblem <dolfinx.fem.petsc.LinearProblem>`
diff --git a/python/demo/demo_cahn-hilliard.py b/python/demo/demo_cahn-hilliard.py
index 3cfed7d48e8..85e7aa45e91 100644
--- a/python/demo/demo_cahn-hilliard.py
+++ b/python/demo/demo_cahn-hilliard.py
@@ -145,7 +145,6 @@
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import CellType, create_unit_square
 from dolfinx.nls.petsc import NewtonSolver
-from ufl import dx, grad, inner
 
 try:
     import pyvista as pv
@@ -200,7 +199,7 @@
 c0, mu0 = ufl.split(u0)
 # -
 
-# The line `c, mu = ufl.split(u)` permits direct access to the
+# The line `c, mu = split(u)` permits direct access to the
 # components of a mixed function. Note that `c` and `mu` are references
 # for components of `u`, and not copies.
 #
@@ -249,8 +248,16 @@
 # which is then used in the definition of the variational forms:
 
 # Weak statement of the equations
-F0 = inner(c, q) * dx - inner(c0, q) * dx + dt * inner(grad(mu_mid), grad(q)) * dx
-F1 = inner(mu, v) * dx - inner(dfdc, v) * dx - lmbda * inner(grad(c), grad(v)) * dx
+F0 = (
+    ufl.inner(c, q) * ufl.dx
+    - ufl.inner(c0, q) * ufl.dx
+    + dt * ufl.inner(ufl.grad(mu_mid), ufl.grad(q)) * ufl.dx
+)
+F1 = (
+    ufl.inner(mu, v) * ufl.dx
+    - ufl.inner(dfdc, v) * ufl.dx
+    - lmbda * ufl.inner(ufl.grad(c), ufl.grad(v)) * ufl.dx
+)
 F = F0 + F1
 
 # This is a statement of the time-discrete equations presented as part
diff --git a/python/demo/demo_elasticity.py b/python/demo/demo_elasticity.py
index 8301667b096..fbca3dbdad7 100644
--- a/python/demo/demo_elasticity.py
+++ b/python/demo/demo_elasticity.py
@@ -53,7 +53,6 @@
 from dolfinx.fem.petsc import apply_lifting, assemble_matrix, assemble_vector
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import CellType, GhostMode, create_box, locate_entities_boundary
-from ufl import dx, grad, inner
 
 dtype = PETSc.ScalarType  # type: ignore
 # -
@@ -135,7 +134,7 @@ def build_nullspace(V: FunctionSpace):
 
 def σ(v):
     """Return an expression for the stress σ given a displacement field"""
-    return 2.0 * μ * ufl.sym(grad(v)) + λ * ufl.tr(ufl.sym(grad(v))) * ufl.Identity(len(v))
+    return 2.0 * μ * ufl.sym(ufl.grad(v)) + λ * ufl.tr(ufl.sym(ufl.grad(v))) * ufl.Identity(len(v))
 
 
 # -
@@ -146,8 +145,8 @@ def σ(v):
 
 V = functionspace(msh, ("Lagrange", 1, (msh.geometry.dim,)))
 u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
-a = form(inner(σ(u), grad(v)) * dx)
-L = form(inner(f, v) * dx)
+a = form(ufl.inner(σ(u), ufl.grad(v)) * ufl.dx)
+L = form(ufl.inner(f, v) * ufl.dx)
 
 # A homogeneous (zero) boundary condition is created on $x_0 = 0$ and
 # $x_1 = 1$ by finding all facets on these boundaries, and then creating
@@ -240,7 +239,7 @@ def σ(v):
 
 # +
 sigma_dev = σ(uh) - (1 / 3) * ufl.tr(σ(uh)) * ufl.Identity(len(uh))
-sigma_vm = ufl.sqrt((3 / 2) * inner(sigma_dev, sigma_dev))
+sigma_vm = ufl.sqrt((3 / 2) * ufl.inner(sigma_dev, sigma_dev))
 # -
 
 # Next, the Von Mises stress is interpolated in a piecewise-constant
diff --git a/python/demo/demo_hdg.py b/python/demo/demo_hdg.py
index 8e0e54a6869..c40ef304d3e 100644
--- a/python/demo/demo_hdg.py
+++ b/python/demo/demo_hdg.py
@@ -41,7 +41,6 @@
 import ufl
 from dolfinx import fem, mesh
 from dolfinx.cpp.mesh import cell_num_entities
-from ufl import div, dot, grad, inner
 
 
 def par_print(comm, string):
@@ -147,17 +146,17 @@ def u_e(x):
 x = ufl.SpatialCoordinate(msh)
 c = 1.0 + 0.1 * ufl.sin(ufl.pi * x[0]) * ufl.sin(ufl.pi * x[1])
 a = (
-    inner(c * grad(u), grad(v)) * dx_c
-    - inner(c * (u - ubar), dot(grad(v), n)) * ds_c(cell_boundaries)
-    - inner(dot(grad(u), n), c * (v - vbar)) * ds_c(cell_boundaries)
-    + gamma * inner(c * (u - ubar), v - vbar) * ds_c(cell_boundaries)
+    ufl.inner(c * ufl.grad(u), ufl.grad(v)) * dx_c
+    - ufl.inner(c * (u - ubar), ufl.dot(ufl.grad(v), n)) * ds_c(cell_boundaries)
+    - ufl.inner(ufl.dot(ufl.grad(u), n), c * (v - vbar)) * ds_c(cell_boundaries)
+    + gamma * ufl.inner(c * (u - ubar), v - vbar) * ds_c(cell_boundaries)
 )
 
 # Manufacture a source term
-f = -div(c * grad(u_e(x)))
+f = -ufl.div(c * ufl.grad(u_e(x)))
 
-L = inner(f, v) * dx_c
-L += inner(fem.Constant(facet_mesh, dtype(0.0)), vbar) * dx_f
+L = ufl.inner(f, v) * dx_c
+L += ufl.inner(fem.Constant(facet_mesh, dtype(0.0)), vbar) * dx_f
 
 # Define block structure
 a_blocked = dolfinx.fem.form(ufl.extract_blocks(a), entity_maps=entity_maps)
diff --git a/python/demo/demo_helmholtz.py b/python/demo/demo_helmholtz.py
index 0c9ae774bd2..051ba18b2bd 100644
--- a/python/demo/demo_helmholtz.py
+++ b/python/demo/demo_helmholtz.py
@@ -51,7 +51,6 @@
 from dolfinx.fem.petsc import LinearProblem
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import create_unit_square
-from ufl import FacetNormal, dot, ds, dx, grad, inner
 
 # Wavenumber
 k0 = 4 * np.pi
@@ -79,14 +78,14 @@
 
 # Define variational problem:
 u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
-a = inner(grad(u), grad(v)) * dx - k0**2 * inner(u, v) * dx
+a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx - k0**2 * ufl.inner(u, v) * ufl.dx
 
 # solve for plane wave with mixed Dirichlet and Neumann BCs
 theta = np.pi / 4
 u_exact.interpolate(lambda x: A * np.exp(1j * k0 * (np.cos(theta) * x[0] + np.sin(theta) * x[1])))
-n = FacetNormal(msh)
-g = -dot(n, grad(u_exact))
-L = -inner(g, v) * ds
+n = ufl.FacetNormal(msh)
+g = -ufl.dot(n, ufl.grad(u_exact))
+L = -ufl.inner(g, v) * ufl.ds
 
 
 dofs_D = locate_dofs_geometrical(
@@ -122,13 +121,17 @@
 
 # H1 errors
 diff = uh - u_exact
-H1_diff = msh.comm.allreduce(assemble_scalar(form(inner(grad(diff), grad(diff)) * dx)), op=MPI.SUM)
+H1_diff = msh.comm.allreduce(
+    assemble_scalar(form(ufl.inner(ufl.grad(diff), ufl.grad(diff)) * ufl.dx)), op=MPI.SUM
+)
 H1_exact = msh.comm.allreduce(
-    assemble_scalar(form(inner(grad(u_exact), grad(u_exact)) * dx)), op=MPI.SUM
+    assemble_scalar(form(ufl.inner(ufl.grad(u_exact), ufl.grad(u_exact)) * ufl.dx)), op=MPI.SUM
 )
 print("Relative H1 error of FEM solution:", abs(np.sqrt(H1_diff) / np.sqrt(H1_exact)))
 
 # L2 errors
-L2_diff = msh.comm.allreduce(assemble_scalar(form(inner(diff, diff) * dx)), op=MPI.SUM)
-L2_exact = msh.comm.allreduce(assemble_scalar(form(inner(u_exact, u_exact) * dx)), op=MPI.SUM)
+L2_diff = msh.comm.allreduce(assemble_scalar(form(ufl.inner(diff, diff) * ufl.dx)), op=MPI.SUM)
+L2_exact = msh.comm.allreduce(
+    assemble_scalar(form(ufl.inner(u_exact, u_exact) * ufl.dx)), op=MPI.SUM
+)
 print("Relative L2 error of FEM solution:", abs(np.sqrt(L2_diff) / np.sqrt(L2_exact)))
diff --git a/python/demo/demo_lagrange_variants.py b/python/demo/demo_lagrange_variants.py
index 9348aa86d29..ef7153be181 100644
--- a/python/demo/demo_lagrange_variants.py
+++ b/python/demo/demo_lagrange_variants.py
@@ -24,9 +24,8 @@
 
 import basix
 import basix.ufl
-import ufl  # type: ignore
+import ufl
 from dolfinx import default_real_type, fem, mesh
-from ufl import dx
 
 # -
 
@@ -175,7 +174,7 @@ def saw_tooth(x):
     V = fem.functionspace(msh, ufl_element)
     uh = fem.Function(V)
     uh.interpolate(lambda x: saw_tooth(x[0]))
-    M = fem.form((u_exact - uh) ** 2 * dx)
+    M = fem.form((u_exact - uh) ** 2 * ufl.dx)
     error = msh.comm.allreduce(fem.assemble_scalar(M), op=MPI.SUM)
     print(f"Computed L2 interpolation error ({variant.name}):", error**0.5)
 # -
diff --git a/python/demo/demo_mixed-poisson.py b/python/demo/demo_mixed-poisson.py
index 51086346b19..44eb2a91032 100644
--- a/python/demo/demo_mixed-poisson.py
+++ b/python/demo/demo_mixed-poisson.py
@@ -110,20 +110,11 @@
 import numpy as np
 
 import dolfinx.fem.petsc
+import ufl
 from basix.ufl import element
 from dolfinx import fem, la, mesh
 from dolfinx.fem.petsc import discrete_gradient, interpolation_matrix
 from dolfinx.mesh import CellType, create_unit_square
-from ufl import (
-    Measure,
-    SpatialCoordinate,
-    TestFunction,
-    TrialFunction,
-    ZeroBaseForm,
-    div,
-    exp,
-    inner,
-)
 
 # Solution scalar (e.g., float32, complex128) and geometry (float32/64)
 # types
@@ -161,16 +152,16 @@
 # $W$, with corresponding test functions $\tau$ and $v$:
 
 # +
-(sigma, u) = TrialFunction(V), TrialFunction(W)
-(tau, v) = TestFunction(V), TestFunction(W)
+(sigma, u) = ufl.TrialFunction(V), ufl.TrialFunction(W)
+(tau, v) = ufl.TestFunction(V), ufl.TestFunction(W)
 # -
 
 # The source function is set to be $f = 10\exp(-((x_{0} - 0.5)^2 +
 # (x_{1} - 0.5)^2) / 0.02)$:
 
 # +
-x = SpatialCoordinate(msh)
-f = 10 * exp(-((x[0] - 0.5) * (x[0] - 0.5) + (x[1] - 0.5) * (x[1] - 0.5)) / 0.02)
+x = ufl.SpatialCoordinate(msh)
+f = 10 * ufl.exp(-((x[0] - 0.5) * (x[0] - 0.5) + (x[1] - 0.5) * (x[1] - 0.5)) / 0.02)
 # -
 
 # We now declare the blocked bilinear and linear forms. The rows of `a`
@@ -184,9 +175,12 @@
 # corresponds to $u_{0} = 0$ on $\Gamma_{D}$.*
 
 # +
-dx = Measure("dx", msh)
-a = [[inner(sigma, tau) * dx, inner(u, div(tau)) * dx], [inner(div(sigma), v) * dx, None]]
-L = [ZeroBaseForm((tau,)), -inner(f, v) * dx]
+dx = ufl.Measure("dx", msh)
+a = [
+    [ufl.inner(sigma, tau) * dx, ufl.inner(u, ufl.div(tau)) * dx],
+    [ufl.inner(ufl.div(sigma), v) * dx, None],
+]
+L = [ufl.ZeroBaseForm((tau,)), -ufl.inner(f, v) * dx]
 # -
 
 # We now compile the abstract/symbolic forms in `a` and `L` into
@@ -263,7 +257,10 @@
 
 # +
 a_p = fem.form(
-    [[inner(sigma, tau) * dx + inner(div(sigma), div(tau)) * dx, None], [None, inner(u, v) * dx]],
+    [
+        [ufl.inner(sigma, tau) * dx + ufl.inner(ufl.div(sigma), ufl.div(tau)) * dx, None],
+        [None, ufl.inner(u, v) * dx],
+    ],
     dtype=dtype,
 )
 P = fem.petsc.assemble_matrix_nest(a_p, bcs=bcs)
diff --git a/python/demo/demo_navier-stokes.py b/python/demo/demo_navier-stokes.py
index a48b9aa3a77..ec5bd5d3ce7 100644
--- a/python/demo/demo_navier-stokes.py
+++ b/python/demo/demo_navier-stokes.py
@@ -179,27 +179,9 @@
 # +
 import numpy as np
 
+import ufl
 from dolfinx import default_real_type, fem, io, mesh
 from dolfinx.fem.petsc import assemble_matrix_block, assemble_vector_block
-from ufl import (
-    CellDiameter,
-    FacetNormal,
-    MixedFunctionSpace,
-    TestFunctions,
-    TrialFunctions,
-    avg,
-    conditional,
-    div,
-    dot,
-    dS,
-    ds,
-    dx,
-    extract_blocks,
-    grad,
-    gt,
-    inner,
-    outer,
-)
 
 try:
     from petsc4py import PETSc
@@ -225,15 +207,18 @@
 # +
 def norm_L2(comm, v):
     """Compute the L2(Ω)-norm of v"""
-    return np.sqrt(comm.allreduce(fem.assemble_scalar(fem.form(inner(v, v) * dx)), op=MPI.SUM))
+    return np.sqrt(
+        comm.allreduce(fem.assemble_scalar(fem.form(ufl.inner(v, v) * ufl.dx)), op=MPI.SUM)
+    )
 
 
 def domain_average(msh, v):
     """Compute the average of a function over the domain"""
     vol = msh.comm.allreduce(
-        fem.assemble_scalar(fem.form(fem.Constant(msh, default_real_type(1.0)) * dx)), op=MPI.SUM
+        fem.assemble_scalar(fem.form(fem.Constant(msh, default_real_type(1.0)) * ufl.dx)),
+        op=MPI.SUM,
     )
-    return (1 / vol) * msh.comm.allreduce(fem.assemble_scalar(fem.form(v * dx)), op=MPI.SUM)
+    return (1 / vol) * msh.comm.allreduce(fem.assemble_scalar(fem.form(v * ufl.dx)), op=MPI.SUM)
 
 
 def u_e_expr(x):
@@ -282,7 +267,7 @@ def f_expr(x):
 # Function spaces for the velocity and for the pressure
 V = fem.functionspace(msh, ("Raviart-Thomas", k + 1))
 Q = fem.functionspace(msh, ("Discontinuous Lagrange", k))
-VQ = MixedFunctionSpace(V, Q)
+VQ = ufl.MixedFunctionSpace(V, Q)
 
 # Function space for visualising the velocity field
 gdim = msh.geometry.dim
@@ -290,18 +275,18 @@ def f_expr(x):
 
 # Define trial and test functions
 
-u, p = TrialFunctions(VQ)
-v, q = TestFunctions(VQ)
+u, p = ufl.TrialFunctions(VQ)
+v, q = ufl.TestFunctions(VQ)
 
 delta_t = fem.Constant(msh, default_real_type(t_end / num_time_steps))
 alpha = fem.Constant(msh, default_real_type(6.0 * k**2))
 
-h = CellDiameter(msh)
-n = FacetNormal(msh)
+h = ufl.CellDiameter(msh)
+n = ufl.FacetNormal(msh)
 
 
 def jump(phi, n):
-    return outer(phi("+"), n("+")) + outer(phi("-"), n("-"))
+    return ufl.outer(phi("+"), n("+")) + ufl.outer(phi("-"), n("-"))
 
 
 # -
@@ -311,27 +296,28 @@ def jump(phi, n):
 
 # +
 a = (1.0 / Re) * (
-    inner(grad(u), grad(v)) * dx
-    - inner(avg(grad(u)), jump(v, n)) * dS
-    - inner(jump(u, n), avg(grad(v))) * dS
-    + (alpha / avg(h)) * inner(jump(u, n), jump(v, n)) * dS
-    - inner(grad(u), outer(v, n)) * ds
-    - inner(outer(u, n), grad(v)) * ds
-    + (alpha / h) * inner(outer(u, n), outer(v, n)) * ds
+    ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx
+    - ufl.inner(ufl.avg(ufl.grad(u)), jump(v, n)) * ufl.dS
+    - ufl.inner(jump(u, n), ufl.avg(ufl.grad(v))) * ufl.dS
+    + (alpha / ufl.avg(h)) * ufl.inner(jump(u, n), jump(v, n)) * ufl.dS
+    - ufl.inner(ufl.grad(u), ufl.outer(v, n)) * ufl.ds
+    - ufl.inner(ufl.outer(u, n), ufl.grad(v)) * ufl.ds
+    + (alpha / h) * ufl.inner(ufl.outer(u, n), ufl.outer(v, n)) * ufl.ds
 )
-a -= inner(p, div(v)) * dx
-a -= inner(div(u), q) * dx
+a -= ufl.inner(p, ufl.div(v)) * ufl.dx
+a -= ufl.inner(ufl.div(u), q) * ufl.dx
 
-a_blocked = fem.form(extract_blocks(a))
+a_blocked = fem.form(ufl.extract_blocks(a))
 
 f = fem.Function(W)
 u_D = fem.Function(V)
 u_D.interpolate(u_e_expr)
-L = inner(f, v) * dx + (1 / Re) * (
-    -inner(outer(u_D, n), grad(v)) * ds + (alpha / h) * inner(outer(u_D, n), outer(v, n)) * ds
+L = ufl.inner(f, v) * ufl.dx + (1 / Re) * (
+    -ufl.inner(ufl.outer(u_D, n), ufl.grad(v)) * ufl.ds
+    + (alpha / h) * ufl.inner(ufl.outer(u_D, n), ufl.outer(v, n)) * ufl.ds
 )
-L += inner(fem.Constant(msh, default_real_type(0.0)), q) * dx
-L_blocked = fem.form(extract_blocks(L))
+L += ufl.inner(fem.Constant(msh, default_real_type(0.0)), q) * ufl.dx
+L_blocked = fem.form(ufl.extract_blocks(L))
 
 # Boundary conditions
 boundary_facets = mesh.exterior_facet_indices(msh.topology)
@@ -404,19 +390,22 @@ def jump(phi, n):
 # Now we add the time stepping and convective terms
 
 # +
-lmbda = conditional(gt(dot(u_n, n), 0), 1, 0)
+lmbda = ufl.conditional(ufl.gt(ufl.dot(u_n, n), 0), 1, 0)
 u_uw = lmbda("+") * u("+") + lmbda("-") * u("-")
 a += (
-    inner(u / delta_t, v) * dx
-    - inner(u, div(outer(v, u_n))) * dx
-    + inner((dot(u_n, n))("+") * u_uw, v("+")) * dS
-    + inner((dot(u_n, n))("-") * u_uw, v("-")) * dS
-    + inner(dot(u_n, n) * lmbda * u, v) * ds
+    ufl.inner(u / delta_t, v) * ufl.dx
+    - ufl.inner(u, ufl.div(ufl.outer(v, u_n))) * ufl.dx
+    + ufl.inner((ufl.dot(u_n, n))("+") * u_uw, v("+")) * ufl.dS
+    + ufl.inner((ufl.dot(u_n, n))("-") * u_uw, v("-")) * ufl.dS
+    + ufl.inner(ufl.dot(u_n, n) * lmbda * u, v) * ufl.ds
 )
-a_blocked = fem.form(extract_blocks(a))
+a_blocked = fem.form(ufl.extract_blocks(a))
 
-L += inner(u_n / delta_t, v) * dx - inner(dot(u_n, n) * (1 - lmbda) * u_D, v) * ds
-L_blocked = fem.form(extract_blocks(L))
+L += (
+    ufl.inner(u_n / delta_t, v) * ufl.dx
+    - ufl.inner(ufl.dot(u_n, n) * (1 - lmbda) * u_D, v) * ufl.ds
+)
+L_blocked = fem.form(ufl.extract_blocks(L))
 
 # Time stepping loop
 for n in range(num_time_steps):
@@ -473,7 +462,7 @@ def jump(phi, n):
 
 # Compute errors
 e_u = norm_L2(msh.comm, u_h - u_e)
-e_div_u = norm_L2(msh.comm, div(u_h))
+e_div_u = norm_L2(msh.comm, ufl.div(u_h))
 
 # This scheme conserves mass exactly, so check this
 assert np.isclose(e_div_u, 0.0, atol=float(1.0e5 * np.finfo(default_real_type).eps))
diff --git a/python/demo/demo_poisson.py b/python/demo/demo_poisson.py
index 38c072211e3..0ee822c719d 100644
--- a/python/demo/demo_poisson.py
+++ b/python/demo/demo_poisson.py
@@ -85,7 +85,6 @@
 import ufl
 from dolfinx import fem, io, mesh, plot
 from dolfinx.fem.petsc import LinearProblem
-from ufl import ds, dx, grad, inner
 
 # -
 
@@ -146,8 +145,8 @@
 x = ufl.SpatialCoordinate(msh)
 f = 10 * ufl.exp(-((x[0] - 0.5) ** 2 + (x[1] - 0.5) ** 2) / 0.02)
 g = ufl.sin(5 * x[0])
-a = inner(grad(u), grad(v)) * dx
-L = inner(f, v) * dx + inner(g, v) * ds
+a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx
+L = ufl.inner(f, v) * ufl.dx + ufl.inner(g, v) * ufl.ds
 # -
 
 # A {py:class}`LinearProblem <dolfinx.fem.petsc.LinearProblem>` object is
diff --git a/python/demo/demo_poisson_matrix_free.py b/python/demo/demo_poisson_matrix_free.py
index b29d9980403..fa5fe103bdd 100644
--- a/python/demo/demo_poisson_matrix_free.py
+++ b/python/demo/demo_poisson_matrix_free.py
@@ -83,7 +83,6 @@
 import dolfinx
 import ufl
 from dolfinx import fem, la
-from ufl import action, dx, grad, inner
 
 # We begin by using {py:func}`create_rectangle
 # <dolfinx.mesh.create_rectangle>` to create a rectangular
@@ -138,8 +137,8 @@
 u = ufl.TrialFunction(V)
 v = ufl.TestFunction(V)
 f = fem.Constant(mesh, dtype(-6.0))
-a = inner(grad(u), grad(v)) * dx
-L = inner(f, v) * dx
+a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx
+L = ufl.inner(f, v) * ufl.dx
 L_fem = fem.form(L, dtype=dtype)
 
 # For the matrix-free solvers we also define a second linear form `M` as
@@ -152,7 +151,7 @@
 # $$
 
 ui = fem.Function(V, dtype=dtype)
-M = action(a, ui)
+M = ufl.action(a, ui)
 M_fem = fem.form(M, dtype=dtype)
 
 # ### Matrix-free conjugate gradient solver
@@ -254,7 +253,7 @@ def _global_dot(comm, v0, v1):
 
 
 def L2Norm(u):
-    val = fem.assemble_scalar(fem.form(inner(u, u) * dx, dtype=dtype))
+    val = fem.assemble_scalar(fem.form(ufl.inner(u, u) * ufl.dx, dtype=dtype))
     return np.sqrt(comm.allreduce(val, op=MPI.SUM))
 
 
diff --git a/python/demo/demo_pyamg.py b/python/demo/demo_pyamg.py
index 69de02486fe..dab612f9a1f 100644
--- a/python/demo/demo_pyamg.py
+++ b/python/demo/demo_pyamg.py
@@ -48,7 +48,6 @@
     locate_dofs_topological,
 )
 from dolfinx.mesh import CellType, create_box, locate_entities_boundary
-from ufl import ds, dx, grad, inner
 
 # -
 
@@ -88,8 +87,8 @@ def poisson_problem(dtype: npt.DTypeLike, solver_type: str):
     x = ufl.SpatialCoordinate(mesh)
     f = 10 * ufl.exp(-((x[0] - 0.5) ** 2 + (x[1] - 0.5) ** 2) / 0.02)
     g = ufl.sin(5 * x[0])
-    a = form(inner(grad(u), grad(v)) * dx, dtype=dtype)
-    L = form(inner(f, v) * dx + inner(g, v) * ds, dtype=dtype)
+    a = form(ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx, dtype=dtype)
+    L = form(ufl.inner(f, v) * ufl.dx + ufl.inner(g, v) * ufl.ds, dtype=dtype)
 
     A = assemble_matrix(a, [bc]).to_scipy()
     b = assemble_vector(L)
@@ -193,12 +192,14 @@ def elasticity_problem(dtype):
 
     def σ(v):
         """Return an expression for the stress σ given a displacement field"""
-        return 2.0 * μ * ufl.sym(grad(v)) + λ * ufl.tr(ufl.sym(grad(v))) * ufl.Identity(len(v))
+        return 2.0 * μ * ufl.sym(ufl.grad(v)) + λ * ufl.tr(ufl.sym(ufl.grad(v))) * ufl.Identity(
+            len(v)
+        )
 
     V = functionspace(mesh, ("Lagrange", 1, (mesh.geometry.dim,)))
     u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
-    a = form(inner(σ(u), grad(v)) * dx, dtype=dtype)
-    L = form(inner(f, v) * dx, dtype=dtype)
+    a = form(ufl.inner(σ(u), ufl.grad(v)) * ufl.dx, dtype=dtype)
+    L = form(ufl.inner(f, v) * ufl.dx, dtype=dtype)
 
     tdim = mesh.topology.dim
     dofs = locate_dofs_topological(V=V, entity_dim=tdim - 1, entities=facets)
diff --git a/python/demo/demo_stokes.py b/python/demo/demo_stokes.py
index 599063f4fc5..5b5df67b4c1 100644
--- a/python/demo/demo_stokes.py
+++ b/python/demo/demo_stokes.py
@@ -116,7 +116,6 @@
 from dolfinx.fem.petsc import assemble_matrix_block, assemble_vector_block
 from dolfinx.io import XDMFFile
 from dolfinx.mesh import CellType, create_rectangle, locate_entities_boundary
-from ufl import ZeroBaseForm, div, dx, grad, inner
 
 # We create a {py:class}`Mesh <dolfinx.mesh.Mesh>`, define functions for
 # locating geometrically subsets of the boundary, and define a function
@@ -185,14 +184,19 @@ def lid_velocity_expression(x):
 (v, q) = ufl.TestFunction(V), ufl.TestFunction(Q)
 f = Constant(msh, (PETSc.ScalarType(0), PETSc.ScalarType(0)))  # type: ignore
 
-a = form([[inner(grad(u), grad(v)) * dx, inner(p, div(v)) * dx], [inner(div(u), q) * dx, None]])
-L = form([inner(f, v) * dx, ZeroBaseForm((q,))])
+a = form(
+    [
+        [ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx, ufl.inner(p, ufl.div(v)) * ufl.dx],
+        [ufl.inner(ufl.div(u), q) * ufl.dx, None],
+    ]
+)
+L = form([ufl.inner(f, v) * ufl.dx, ufl.ZeroBaseForm((q,))])
 # -
 
 # A block-diagonal preconditioner will be used with the iterative
 # solvers for this problem:
 
-a_p11 = form(inner(p, q) * dx)
+a_p11 = form(ufl.inner(p, q) * ufl.dx)
 a_p = [[a[0][0], None], [None, a_p11]]
 
 # ### Nested matrix solver
@@ -503,8 +507,11 @@ def mixed_direct():
     (u, p) = ufl.TrialFunctions(W)
     (v, q) = ufl.TestFunctions(W)
     f = Function(Q)
-    a = form((inner(grad(u), grad(v)) + inner(p, div(v)) + inner(div(u), q)) * dx)
-    L = form(inner(f, v) * dx)
+    a = form(
+        (ufl.inner(ufl.grad(u), ufl.grad(v)) + ufl.inner(p, ufl.div(v)) + ufl.inner(ufl.div(u), q))
+        * ufl.dx
+    )
+    L = form(ufl.inner(f, v) * ufl.dx)
 
     # Assemble LHS matrix and RHS vector
     A = fem.petsc.assemble_matrix(a, bcs=bcs)
diff --git a/python/demo/demo_tnt-elements.py b/python/demo/demo_tnt-elements.py
index 06eadf560b6..810d18533a7 100644
--- a/python/demo/demo_tnt-elements.py
+++ b/python/demo/demo_tnt-elements.py
@@ -41,9 +41,9 @@
 
 import basix
 import basix.ufl
+import ufl
 from dolfinx import default_real_type, fem, mesh
 from dolfinx.fem.petsc import LinearProblem
-from ufl import SpatialCoordinate, TestFunction, TrialFunction, cos, div, dx, grad, inner, sin
 
 mpl.use("agg")
 # -
@@ -256,14 +256,14 @@ def create_tnt_quad(degree):
 
 def poisson_error(V: fem.FunctionSpace):
     msh = V.mesh
-    u, v = TrialFunction(V), TestFunction(V)
+    u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
 
-    x = SpatialCoordinate(msh)
-    u_exact = sin(10 * x[1]) * cos(15 * x[0])
-    f = -div(grad(u_exact))
+    x = ufl.SpatialCoordinate(msh)
+    u_exact = ufl.sin(10 * x[1]) * ufl.cos(15 * x[0])
+    f = -ufl.div(ufl.grad(u_exact))
 
-    a = inner(grad(u), grad(v)) * dx
-    L = inner(f, v) * dx
+    a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx
+    L = ufl.inner(f, v) * ufl.dx
 
     # Create Dirichlet boundary condition
     u_bc = fem.Function(V)
@@ -278,7 +278,7 @@ def poisson_error(V: fem.FunctionSpace):
     problem = LinearProblem(a, L, bcs=[bc], petsc_options={"ksp_rtol": 1e-12})
     uh = problem.solve()
 
-    M = (u_exact - uh) ** 2 * dx
+    M = (u_exact - uh) ** 2 * ufl.dx
     M = fem.form(M)
     error = msh.comm.allreduce(fem.assemble_scalar(M), op=MPI.SUM)
     return error**0.5
diff --git a/python/demo/pyproject.toml b/python/demo/pyproject.toml
new file mode 100644
index 00000000000..28e7749be23
--- /dev/null
+++ b/python/demo/pyproject.toml
@@ -0,0 +1,5 @@
+[tool.ruff]
+extend = "../pyproject.toml"
+
+[tool.ruff.lint.flake8-import-conventions]
+banned-from = ["ufl"]

From 7dee60cc769342bab35c8ee3c4c8a1df02a285be Mon Sep 17 00:00:00 2001
From: Chris Richardson <chris@bpi.cam.ac.uk>
Date: Sat, 11 Jan 2025 19:40:11 +0000
Subject: [PATCH 3/3] Move SpMV from tests into MatrixCSR (#3591)

* Move SPMV to MatrixCSR from test

* ruff

* Add to python

* Template block size

* Put back c++20 in tests

* Rename as Scalar for doxygen

* Try to get BLAS

* Simplifications

* Small updates

---------

Co-authored-by: Garth N. Wells <gnw20@cam.ac.uk>
---
 cpp/dolfinx/la/MatrixCSR.h                    |  83 ++++++++
 cpp/dolfinx/la/matrix_csr_impl.h              |  64 ++++--
 cpp/test/CMakeLists.txt                       |   4 +-
 cpp/test/matrix.cpp                           |  84 +-------
 python/dolfinx/la.py                          | 183 +++++++++---------
 python/dolfinx/wrappers/la.cpp                |   1 +
 .../test/unit/fem/test_discrete_operators.py  |  11 +-
 python/test/unit/la/test_matrix_vector.py     |  32 +++
 8 files changed, 271 insertions(+), 191 deletions(-)

diff --git a/cpp/dolfinx/la/MatrixCSR.h b/cpp/dolfinx/la/MatrixCSR.h
index 1a6738d676a..c15e7248dd5 100644
--- a/cpp/dolfinx/la/MatrixCSR.h
+++ b/cpp/dolfinx/la/MatrixCSR.h
@@ -7,6 +7,7 @@
 #pragma once
 
 #include "SparsityPattern.h"
+#include "Vector.h"
 #include "matrix_csr_impl.h"
 #include <algorithm>
 #include <dolfinx/common/IndexMap.h>
@@ -322,6 +323,15 @@ class MatrixCSR
   /// @note MPI Collective
   double squared_norm() const;
 
+  /// @brief Compute the product `y += Ax`.
+  ///
+  /// The vectors `x` and `y` must have parallel layouts that are
+  /// compatible with `A`.
+  ///
+  /// @param[in] x Vector to be apply `A` to.
+  /// @param[in,out] y Vector to accumulate the result into.
+  void mult(Vector<value_type>& x, Vector<value_type>& y);
+
   /// @brief Index maps for the row and column space.
   ///
   /// The row IndexMap contains ghost entries for rows which may be
@@ -734,4 +744,77 @@ double MatrixCSR<U, V, W, X>::squared_norm() const
 }
 //-----------------------------------------------------------------------------
 
+// The matrix A is distributed across P  processes by blocks of rows:
+//  A = |   A_0  |
+//      |   A_1  |
+//      |   ...  |
+//      |  A_P-1 |
+//
+// Each submatrix A_i is owned by a single process "i" and can be further
+// decomposed into diagonal (Ai[0]) and off diagonal (Ai[1]) blocks:
+//  Ai = |Ai[0] Ai[1]|
+//
+// If A is square, the diagonal block Ai[0] is also square and contains
+// only owned columns and rows. The block Ai[1] contains ghost columns
+// (unowned dofs).
+
+// Likewise, a local vector x can be decomposed into owned and ghost blocks:
+// xi = |   x[0]  |
+//      |   x[1]  |
+//
+// So the product y = Ax can be computed into two separate steps:
+//  y[0] = |Ai[0] Ai[1]| |   x[0]  | = Ai[0] x[0] + Ai[1] x[1]
+//                       |   x[1]  |
+//
+/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors x,y
+template <typename Scalar, typename V, typename W, typename X>
+void MatrixCSR<Scalar, V, W, X>::mult(la::Vector<Scalar>& x,
+                                      la::Vector<Scalar>& y)
+{
+  // start communication (update ghosts)
+  x.scatter_fwd_begin();
+
+  const std::int32_t nrowslocal = num_owned_rows();
+  std::span<const std::int64_t> Arow_ptr(row_ptr().data(), nrowslocal + 1);
+  std::span<const std::int32_t> Acols(cols().data(), Arow_ptr[nrowslocal]);
+  std::span<const std::int64_t> Aoff_diag_offset(off_diag_offset().data(),
+                                                 nrowslocal);
+  std::span<const Scalar> Avalues(values().data(), Arow_ptr[nrowslocal]);
+
+  std::span<const Scalar> _x = x.array();
+  std::span<Scalar> _y = y.mutable_array();
+
+  std::span<const std::int64_t> Arow_begin(Arow_ptr.data(), nrowslocal);
+  std::span<const std::int64_t> Arow_end(Arow_ptr.data() + 1, nrowslocal);
+
+  // First stage:  spmv - diagonal
+  // yi[0] += Ai[0] * xi[0]
+  if (_bs[1] == 1)
+  {
+    impl::spmv<Scalar, 1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
+                          _bs[0], 1);
+  }
+  else
+  {
+    impl::spmv<Scalar, -1>(Avalues, Arow_begin, Aoff_diag_offset, Acols, _x, _y,
+                           _bs[0], _bs[1]);
+  }
+
+  // finalize ghost update
+  x.scatter_fwd_end();
+
+  // Second stage:  spmv - off-diagonal
+  // yi[0] += Ai[1] * xi[1]
+  if (_bs[1] == 1)
+  {
+    impl::spmv<Scalar, 1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
+                          _bs[0], 1);
+  }
+  else
+  {
+    impl::spmv<Scalar, -1>(Avalues, Aoff_diag_offset, Arow_end, Acols, _x, _y,
+                           _bs[0], _bs[1]);
+  }
+}
+
 } // namespace dolfinx::la
diff --git a/cpp/dolfinx/la/matrix_csr_impl.h b/cpp/dolfinx/la/matrix_csr_impl.h
index 4da90cb046f..8a218a130e4 100644
--- a/cpp/dolfinx/la/matrix_csr_impl.h
+++ b/cpp/dolfinx/la/matrix_csr_impl.h
@@ -100,14 +100,12 @@ void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
                            const X& x, const Y& xrows, const Y& xcols, OP op,
                            typename Y::value_type num_rows, int bs0, int bs1);
 
-} // namespace impl
-
 //-----------------------------------------------------------------------------
 template <int BS0, int BS1, typename OP, typename U, typename V, typename W,
           typename X, typename Y>
-void impl::insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
-                      const Y& xrows, const Y& xcols, OP op,
-                      [[maybe_unused]] typename Y::value_type num_rows)
+void insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+                const Y& xrows, const Y& xcols, OP op,
+                [[maybe_unused]] typename Y::value_type num_rows)
 {
   const std::size_t nc = xcols.size();
   assert(x.size() == xrows.size() * xcols.size() * BS0 * BS1);
@@ -151,9 +149,9 @@ void impl::insert_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
 // Insert with block insertion into a regular CSR (block size 1)
 template <int BS0, int BS1, typename OP, typename U, typename V, typename W,
           typename X, typename Y>
-void impl::insert_blocked_csr(U&& data, const V& cols, const W& row_ptr,
-                              const X& x, const Y& xrows, const Y& xcols, OP op,
-                              [[maybe_unused]] typename Y::value_type num_rows)
+void insert_blocked_csr(U&& data, const V& cols, const W& row_ptr, const X& x,
+                        const Y& xrows, const Y& xcols, OP op,
+                        [[maybe_unused]] typename Y::value_type num_rows)
 {
   const std::size_t nc = xcols.size();
   assert(x.size() == xrows.size() * xcols.size() * BS0 * BS1);
@@ -195,12 +193,10 @@ void impl::insert_blocked_csr(U&& data, const V& cols, const W& row_ptr,
 // Add individual entries in block-CSR storage
 template <typename OP, typename U, typename V, typename W, typename X,
           typename Y>
-void impl::insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
-                                 const X& x, const Y& xrows, const Y& xcols,
-                                 OP op,
-                                 [[maybe_unused]]
-                                 typename Y::value_type num_rows,
-                                 int bs0, int bs1)
+void insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
+                           const X& x, const Y& xrows, const Y& xcols, OP op,
+                           [[maybe_unused]] typename Y::value_type num_rows,
+                           int bs0, int bs1)
 {
   const std::size_t nc = xcols.size();
   const int nbs = bs0 * bs1;
@@ -237,4 +233,44 @@ void impl::insert_nonblocked_csr(U&& data, const V& cols, const W& row_ptr,
   }
 }
 //-----------------------------------------------------------------------------
+
+template <typename T, int BS1>
+void spmv(std::span<const T> values, std::span<const std::int64_t> row_begin,
+          std::span<const std::int64_t> row_end,
+          std::span<const std::int32_t> indices, std::span<const T> x,
+          std::span<T> y, int bs0, int bs1)
+{
+  assert(row_begin.size() == row_end.size());
+
+  for (int k0 = 0; k0 < bs0; ++k0)
+  {
+    for (std::size_t i = 0; i < row_begin.size(); i++)
+    {
+      T vi{0};
+      for (std::int32_t j = row_begin[i]; j < row_end[i]; j++)
+      {
+        if constexpr (BS1 == -1)
+        {
+          for (int k1 = 0; k1 < bs1; ++k1)
+          {
+            vi += values[j * bs1 * bs0 + k1 * bs0 + k0]
+                  * x[indices[j] * bs1 + k1];
+          }
+        }
+        else
+        {
+          for (int k1 = 0; k1 < BS1; ++k1)
+          {
+            vi += values[j * BS1 * bs0 + k1 * bs0 + k0]
+                  * x[indices[j] * BS1 + k1];
+          }
+        }
+      }
+
+      y[i * bs0 + k0] += vi;
+    }
+  }
+}
+
+} // namespace impl
 } // namespace dolfinx::la
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index ada545792a1..e5b9a8a37ab 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -51,11 +51,13 @@ add_executable(
   ${CMAKE_CURRENT_BINARY_DIR}/poisson.c
 )
 target_link_libraries(unittests PRIVATE Catch2::Catch2WithMain dolfinx)
+
 # UUID requires bcrypt to be linked on Windows, broken in vcpkg.
 # https://github.com/microsoft/vcpkg/issues/4481
 if(WIN32)
   target_link_libraries(unittests PRIVATE bcrypt)
 endif()
+
 target_include_directories(
   unittests PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
 )
@@ -71,7 +73,7 @@ endif()
 target_compile_options(
   unittests
   PRIVATE
-    $<$<AND:$<CONFIG:Developer>,$<COMPILE_LANGUAGE:CXX>>:${DOLFINX_CXX_DEVELOPER_FLAGS}>
+  $<$<AND:$<CONFIG:Developer>,$<COMPILE_LANGUAGE:CXX>>:${DOLFINX_CXX_DEVELOPER_FLAGS}>
 )
 
 # Enable testing
diff --git a/cpp/test/matrix.cpp b/cpp/test/matrix.cpp
index 9741111964c..ce0f3644fff 100644
--- a/cpp/test/matrix.cpp
+++ b/cpp/test/matrix.cpp
@@ -23,88 +23,6 @@ using namespace dolfinx;
 
 namespace
 {
-/// Computes y += A*x for a local CSR matrix A and local dense vectors x,y
-/// @param[in] values Nonzero values of A
-/// @param[in] row_begin First index of each row in the arrays values and
-/// indices.
-/// @param[in] row_end Last index of each row in the arrays values and indices.
-/// @param[in] indices Column indices for each non-zero element of the matrix A
-/// @param[in] x Input vector
-/// @param[in, out] x Output vector
-template <typename T>
-void spmv_impl(std::span<const T> values,
-               std::span<const std::int64_t> row_begin,
-               std::span<const std::int64_t> row_end,
-               std::span<const std::int32_t> indices, std::span<const T> x,
-               std::span<T> y)
-{
-  assert(row_begin.size() == row_end.size());
-  for (std::size_t i = 0; i < row_begin.size(); i++)
-  {
-    double vi{0};
-    for (std::int32_t j = row_begin[i]; j < row_end[i]; j++)
-      vi += values[j] * x[indices[j]];
-    y[i] += vi;
-  }
-}
-
-// The matrix A is distributed across P  processes by blocks of rows:
-//  A = |   A_0  |
-//      |   A_1  |
-//      |   ...  |
-//      |  A_P-1 |
-//
-// Each submatrix A_i is owned by a single process "i" and can be further
-// decomposed into diagonal (Ai[0]) and off diagonal (Ai[1]) blocks:
-//  Ai = |Ai[0] Ai[1]|
-//
-// If A is square, the diagonal block Ai[0] is also square and contains
-// only owned columns and rows. The block Ai[1] contains ghost columns
-// (unowned dofs).
-
-// Likewise, a local vector x can be decomposed into owned and ghost blocks:
-// xi = |   x[0]  |
-//      |   x[1]  |
-//
-// So the product y = Ax can be computed into two separate steps:
-//  y[0] = |Ai[0] Ai[1]| |   x[0]  | = Ai[0] x[0] + Ai[1] x[1]
-//                       |   x[1]  |
-//
-/// Computes y += A*x for a parallel CSR matrix A and parallel dense vectors x,y
-/// @param[in] A Parallel CSR matrix
-/// @param[in] x Input vector
-/// @param[in, out] y Output vector
-template <typename T>
-void spmv(la::MatrixCSR<T>& A, la::Vector<T>& x, la::Vector<T>& y)
-{
-  // start communication (update ghosts)
-  x.scatter_fwd_begin();
-
-  const std::int32_t nrowslocal = A.num_owned_rows();
-  std::span<const std::int64_t> row_ptr(A.row_ptr().data(), nrowslocal + 1);
-  std::span<const std::int32_t> cols(A.cols().data(), row_ptr[nrowslocal]);
-  std::span<const std::int64_t> off_diag_offset(A.off_diag_offset().data(),
-                                                nrowslocal);
-  std::span<const T> values(A.values().data(), row_ptr[nrowslocal]);
-
-  std::span<const T> _x = x.array();
-  std::span<T> _y = y.mutable_array();
-
-  std::span<const std::int64_t> row_begin(row_ptr.data(), nrowslocal);
-  std::span<const std::int64_t> row_end(row_ptr.data() + 1, nrowslocal);
-
-  // First stage:  spmv - diagonal
-  // yi[0] += Ai[0] * xi[0]
-  spmv_impl<T>(values, row_begin, off_diag_offset, cols, _x, _y);
-
-  // finalize ghost update
-  x.scatter_fwd_end();
-
-  // Second stage:  spmv - off-diagonal
-  // yi[0] += Ai[1] * xi[1]
-  spmv_impl<T>(values, off_diag_offset, row_end, cols, _x, _y);
-}
-
 /// @brief Create a matrix operator
 /// @param comm The communicator to builf the matrix on
 /// @return The assembled matrix
@@ -195,7 +113,7 @@ la::MatrixCSR<double> create_operator(MPI_Comm comm)
 
   // Matrix A represents the action of the Laplace operator, so when
   // applied to a constant vector the result should be zero
-  spmv(A, x, y);
+  A.mult(x, y);
 
   std::ranges::for_each(y.array(),
                         [](auto a) { REQUIRE(std::abs(a) < 1e-13); });
diff --git a/python/dolfinx/la.py b/python/dolfinx/la.py
index 49ce24a6a5a..23c5fba198c 100644
--- a/python/dolfinx/la.py
+++ b/python/dolfinx/la.py
@@ -27,6 +27,90 @@
 ]
 
 
+class Vector:
+    _cpp_object: typing.Union[
+        _cpp.la.Vector_float32,
+        _cpp.la.Vector_float64,
+        _cpp.la.Vector_complex64,
+        _cpp.la.Vector_complex128,
+        _cpp.la.Vector_int8,
+        _cpp.la.Vector_int32,
+        _cpp.la.Vector_int64,
+    ]
+
+    def __init__(
+        self,
+        x: typing.Union[
+            _cpp.la.Vector_float32,
+            _cpp.la.Vector_float64,
+            _cpp.la.Vector_complex64,
+            _cpp.la.Vector_complex128,
+            _cpp.la.Vector_int8,
+            _cpp.la.Vector_int32,
+            _cpp.la.Vector_int64,
+        ],
+    ):
+        """A distributed vector object.
+
+        Args:
+            x: C++ Vector object.
+
+        Note:
+            This initialiser is intended for internal library use only.
+            User code should call :func:`vector` to create a vector object.
+        """
+        self._cpp_object = x
+        self._petsc_x = None
+
+    def __del__(self):
+        if self._petsc_x is not None:
+            self._petsc_x.destroy()
+
+    @property
+    def index_map(self) -> IndexMap:
+        """Index map that describes size and parallel distribution."""
+        return self._cpp_object.index_map
+
+    @property
+    def block_size(self) -> int:
+        """Block size for the vector."""
+        return self._cpp_object.bs
+
+    @property
+    def array(self) -> np.ndarray:
+        """Local representation of the vector."""
+        return self._cpp_object.array
+
+    @property
+    def petsc_vec(self):
+        """PETSc vector holding the entries of the vector.
+
+        Upon first call, this function creates a PETSc ``Vec`` object
+        that wraps the degree-of-freedom data. The ``Vec`` object is
+        cached and the cached ``Vec`` is returned upon subsequent calls.
+
+        Note:
+          When the object is destroyed it will destroy the underlying petsc4py
+          vector automatically.
+        """
+        if self._petsc_x is None:
+            self._petsc_x = create_petsc_vector_wrap(self)
+        return self._petsc_x
+
+    def scatter_forward(self) -> None:
+        """Update ghost entries."""
+        self._cpp_object.scatter_forward()
+
+    def scatter_reverse(self, mode: InsertMode) -> None:
+        """Scatter ghost entries to owner.
+
+        Args:
+            mode: Control how scattered values are set/accumulated by
+                owner.
+        """
+        self._cpp_object.scatter_reverse(mode)
+
+
 class MatrixCSR:
     _cpp_object: typing.Union[
         _cpp.la.MatrixCSR_float32,
@@ -63,8 +147,17 @@ def index_map(self, i: int) -> IndexMap:
         """
         return self._cpp_object.index_map(i)
 
+    def mult(self, x: Vector, y: Vector) -> None:
+        """Compute ``y += Ax``.
+
+        Args:
+            x: Input Vector
+            y: Output Vector
+        """
+        self._cpp_object.mult(x._cpp_object, y._cpp_object)
+
     @property
-    def block_size(self):
+    def block_size(self) -> list:
         """Block sizes for the matrix."""
         return self._cpp_object.bs
 
@@ -128,11 +221,10 @@ def to_dense(self) -> npt.NDArray[np.floating]:
 
         Note:
             Typically used for debugging.
-
         """
         return self._cpp_object.to_dense()
 
-    def to_scipy(self, ghosted=False):
+    def to_scipy(self, ghosted: bool = False):
         """Convert to a SciPy CSR/BSR matrix. Data is shared.
 
         Note:
@@ -202,90 +294,6 @@ def matrix_csr(
     return MatrixCSR(ftype(sp, block_mode))
 
 
-class Vector:
-    _cpp_object: typing.Union[
-        _cpp.la.Vector_float32,
-        _cpp.la.Vector_float64,
-        _cpp.la.Vector_complex64,
-        _cpp.la.Vector_complex128,
-        _cpp.la.Vector_int8,
-        _cpp.la.Vector_int32,
-        _cpp.la.Vector_int64,
-    ]
-
-    def __init__(
-        self,
-        x: typing.Union[
-            _cpp.la.Vector_float32,
-            _cpp.la.Vector_float64,
-            _cpp.la.Vector_complex64,
-            _cpp.la.Vector_complex128,
-            _cpp.la.Vector_int8,
-            _cpp.la.Vector_int32,
-            _cpp.la.Vector_int64,
-        ],
-    ):
-        """A distributed vector object.
-
-        Args:
-            x: C++ Vector object.
-
-        Note:
-            This initialiser is intended for internal library use only.
-            User code should call :func:`vector` to create a vector object.
-        """
-        self._cpp_object = x
-        self._petsc_x = None
-
-    def __del__(self):
-        if self._petsc_x is not None:
-            self._petsc_x.destroy()
-
-    @property
-    def index_map(self) -> IndexMap:
-        """Index map that describes size and parallel distribution."""
-        return self._cpp_object.index_map
-
-    @property
-    def block_size(self) -> int:
-        """Block size for the vector."""
-        return self._cpp_object.bs
-
-    @property
-    def array(self) -> np.ndarray:
-        """Local representation of the vector."""
-        return self._cpp_object.array
-
-    @property
-    def petsc_vec(self):
-        """PETSc vector holding the entries of the vector.
-
-        Upon first call, this function creates a PETSc ``Vec`` object
-        that wraps the degree-of-freedom data. The ``Vec`` object is
-        cached and the cached ``Vec`` is returned upon subsequent calls.
-
-        Note:
-          When the object is destroyed it will destroy the underlying petsc4py
-          vector automatically.
-        """
-        if self._petsc_x is None:
-            self._petsc_x = create_petsc_vector_wrap(self)
-        return self._petsc_x
-
-    def scatter_forward(self) -> None:
-        """Update ghost entries."""
-        self._cpp_object.scatter_forward()
-
-    def scatter_reverse(self, mode: InsertMode) -> None:
-        """Scatter ghost entries to owner.
-
-        Args:
-            mode: Control how scattered values are set/accumulated by
-                owner.
-        """
-        self._cpp_object.scatter_reverse(mode)
-
-
 def vector(map, bs=1, dtype: npt.DTypeLike = np.float64) -> Vector:
     """Create a distributed vector.
 
@@ -332,7 +340,6 @@ def create_petsc_vector_wrap(x: Vector):
 
     Returns:
         A PETSc vector that shares data with ``x``.
-
     """
     from petsc4py import PETSc
 
diff --git a/python/dolfinx/wrappers/la.cpp b/python/dolfinx/wrappers/la.cpp
index 35fece54300..b04209c277b 100644
--- a/python/dolfinx/wrappers/la.cpp
+++ b/python/dolfinx/wrappers/la.cpp
@@ -132,6 +132,7 @@ void declare_objects(nb::module_& m, const std::string& type)
                &dolfinx::la::MatrixCSR<T>::set),
            nb::arg("x"))
       .def("scatter_reverse", &dolfinx::la::MatrixCSR<T>::scatter_rev)
+      .def("mult", &dolfinx::la::MatrixCSR<T>::mult)
       .def("to_dense",
            [](const dolfinx::la::MatrixCSR<T>& self)
            {
diff --git a/python/test/unit/fem/test_discrete_operators.py b/python/test/unit/fem/test_discrete_operators.py
index 16f7283f6ce..eab5d82dd26 100644
--- a/python/test/unit/fem/test_discrete_operators.py
+++ b/python/test/unit/fem/test_discrete_operators.py
@@ -233,9 +233,10 @@ def test_interpolation_matrix(cell_type, p, q, from_lagrange):
 
     V = functionspace(mesh, el0)
     W = functionspace(mesh, el1)
-    G = interpolation_matrix(V, W).to_scipy()
+    G = interpolation_matrix(V, W)
 
-    u = Function(V)
+    uvec = dolfinx.la.vector(G.index_map(1), bs=G.block_size[1])
+    u = Function(V, uvec)
 
     def f(x):
         if mesh.geometry.dim == 2:
@@ -249,10 +250,10 @@ def f(x):
 
     # Compute global matrix vector product
     w = Function(W)
-    ux = np.zeros(G.shape[1])
-    ux[: len(u.x.array)] = u.x.array[:]
-    w.x.array[: G.shape[0]] = G @ ux
+    w.x.array[:] = 0
+    G.mult(u.x, w.x)
     w.x.scatter_forward()
 
     atol = 100 * np.finfo(default_real_type).resolution
+
     assert np.allclose(w_vec.x.array, w.x.array, atol=atol)
diff --git a/python/test/unit/la/test_matrix_vector.py b/python/test/unit/la/test_matrix_vector.py
index 7114d3af70a..c917d2a8ffe 100644
--- a/python/test/unit/la/test_matrix_vector.py
+++ b/python/test/unit/la/test_matrix_vector.py
@@ -44,6 +44,38 @@ def test_create_matrix_csr():
     assert np.allclose(dense, np.zeros(dense.shape, dtype=np.complex128))
 
 
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.float32,
+        np.float64,
+        np.complex64,
+        np.complex128,
+    ],
+)
+def test_matvec(dtype):
+    mesh = create_unit_square(MPI.COMM_WORLD, 3, 3)
+    imap = mesh.topology.index_map(0)
+    sp = _cpp.la.SparsityPattern(mesh.comm, [imap, imap], [1, 1])
+    rows = np.arange(0, imap.size_local)
+    cols = np.arange(0, imap.size_local)
+    sp.insert(rows, cols)
+    sp.finalize()
+
+    # Identity
+    A = la.matrix_csr(sp, dtype=dtype)
+    for i in range(imap.size_local):
+        A.add([2.0], [i], [i])
+    A.scatter_reverse()
+
+    b = la.vector(imap, dtype=dtype)
+    u = la.vector(imap, dtype=dtype)
+    b.array[:] = 1.0
+    A.mult(b, u)
+    u.scatter_forward()
+    assert np.allclose(u.array[: imap.size_local], 2.0)
+
+
 @pytest.mark.parametrize(
     "dtype",
     [