From 98ac2f75419e9f681a6325e36acc8350bb6f80b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Januszewski?= <michalj@gmail.com>
Date: Thu, 20 Dec 2012 14:04:31 +0100
Subject: [PATCH] Split Shan-Chen and Free-Energy code into two kernels, one
 per lattice.

---
 examples/sc_drop.py                       |   2 +-
 examples/sc_phase_separation.py           |   2 +-
 sailfish/backend_cuda.py                  |   1 +
 sailfish/lb_base.py                       |  13 +-
 sailfish/lb_binary.py                     | 255 ++++++++++++-----
 sailfish/lb_single.py                     |  28 +-
 sailfish/subdomain_runner.py              |  33 +--
 sailfish/sym_force.py                     |   1 -
 sailfish/templates/binary_fluid.mako      | 324 +++++++++++-----------
 sailfish/templates/binary_shan_chen.mako  | 121 ++++++++
 sailfish/templates/boundary.mako          |   5 +-
 sailfish/templates/kernel_common.mako     |  33 ++-
 sailfish/templates/relaxation.mako        | 161 +++++------
 sailfish/templates/relaxation_common.mako | 116 ++++----
 sailfish/templates/shan_chen.mako         | 156 ++---------
 sailfish/templates/single_fluid.mako      |  38 +--
 16 files changed, 702 insertions(+), 587 deletions(-)
 create mode 100644 sailfish/templates/binary_shan_chen.mako

diff --git a/examples/sc_drop.py b/examples/sc_drop.py
index b9dd2833..26b9cb3c 100755
--- a/examples/sc_drop.py
+++ b/examples/sc_drop.py
@@ -23,7 +23,7 @@ def update_defaults(cls, defaults):
         defaults.update({
             'lat_nx': 256,
             'lat_ny': 256,
-            'G': 5.0,
+            'G': -5.0,
             'visc': 1.0 / 6.0,
             'periodic_x': True,
             'periodic_y': True,
diff --git a/examples/sc_phase_separation.py b/examples/sc_phase_separation.py
index b4735893..9ecbab69 100755
--- a/examples/sc_phase_separation.py
+++ b/examples/sc_phase_separation.py
@@ -24,7 +24,7 @@ def update_defaults(cls, defaults):
         defaults.update({
             'lat_nx': 256,
             'lat_ny': 256,
-            'G': 5.0,
+            'G': -5.0,
             'visc': 1.0 / 6.0,
             'periodic_x': True,
             'periodic_y': True,
diff --git a/sailfish/backend_cuda.py b/sailfish/backend_cuda.py
index d9be644c..27f0b6b3 100644
--- a/sailfish/backend_cuda.py
+++ b/sailfish/backend_cuda.py
@@ -199,6 +199,7 @@ def get_kernel(self, prog, name, block, args, args_format, shared=0,
 
         if needs_iteration:
             args_format += 'i'
+            args = list(args)
             args.append(0)
             self._iteration_kernels.append(kern)
 
diff --git a/sailfish/lb_base.py b/sailfish/lb_base.py
index 0cbabe47..d8051cfa 100644
--- a/sailfish/lb_base.py
+++ b/sailfish/lb_base.py
@@ -13,6 +13,7 @@
 
 FieldPair = namedtuple('FieldPair', 'abstract buffer')
 ForcePair = namedtuple('ForcePair', 'numeric symbolic')
+KernelPair = namedtuple('KernelPair', 'primary secondary')
 
 class LBSim(object):
     """Describes a specific type of a lattice Boltzmann simulation."""
@@ -173,7 +174,14 @@ def after_step(self, runner):
         pass
 
     def get_compute_kernels(self, runner, full_output, bulk):
-        return []
+        """
+        :param runner: SubdomainRunner object
+        :param full_output: if True, returns kernels that prepare fields for
+                visualization or saving into a file
+        :param bulk: if True, returns kernels that process the bulk domain,
+                otherwise returns kernels that process the subdomain boundary
+        """
+        return KernelPair(None, None)
 
     def get_pbc_kernels(self, runner):
         return []
@@ -258,6 +266,9 @@ def use_force_for_equilibrium(self, force_grid, target_grid):
         To disable acceleration on a grid, pass an invalid grid ID in force_grid
         (e.g. None or -1).
 
+        Note: this is currently only supported in the free-energy MRT model.
+        The force reassignment will be silently ignored in other models.
+
         :param force_grid: grid ID from which the acceleration will be used
         :param target_grid: grid ID on which the acceleration will act
         """
diff --git a/sailfish/lb_binary.py b/sailfish/lb_binary.py
index e4c5bd32..23380edf 100644
--- a/sailfish/lb_binary.py
+++ b/sailfish/lb_binary.py
@@ -8,7 +8,7 @@
 from functools import partial
 import numpy as np
 from sailfish import subdomain_runner, sym, sym_equilibrium
-from sailfish.lb_base import LBSim, LBForcedSim, ScalarField, VectorField
+from sailfish.lb_base import LBSim, LBForcedSim, ScalarField, VectorField, KernelPair
 from sailfish.lb_single import MacroKernels
 
 
@@ -92,74 +92,6 @@ def get_pbc_kernels(self, runner):
         ret = MacroKernels(macro=macro_kernels, distributions=dist_kernels)
         return ret
 
-    def get_compute_kernels(self, runner, full_output, bulk):
-        gpu_rho = runner.gpu_field(self.rho)
-        gpu_phi = runner.gpu_field(self.phi)
-        gpu_v = runner.gpu_field(self.v)
-        gpu_map = runner.gpu_geo_map()
-
-        gpu_dist1a = runner.gpu_dist(0, 0)
-        gpu_dist1b = runner.gpu_dist(0, 1)
-        gpu_dist2a = runner.gpu_dist(1, 0)
-        gpu_dist2b = runner.gpu_dist(1, 1)
-
-        options = 0
-        if full_output:
-            options |= 1
-        if bulk:
-            options |= 2
-
-        options = np.uint32(options)
-        args1 = [gpu_map, gpu_dist1a, gpu_dist1b, gpu_dist2a, gpu_dist2b,
-                gpu_rho, gpu_phi] + gpu_v + [options]
-        args2 = [gpu_map, gpu_dist1b, gpu_dist1a, gpu_dist2b, gpu_dist2a,
-                gpu_rho, gpu_phi] + gpu_v + [options]
-
-        macro_args1 = [gpu_map, gpu_dist1a, gpu_dist2a, gpu_rho, gpu_phi,
-                options]
-        macro_args2 = [gpu_map, gpu_dist1b, gpu_dist2b, gpu_rho, gpu_phi,
-                options]
-
-        args_signature = 'P' * (len(args1) - 1) + 'i'
-        macro_signature = 'P' * (len(macro_args1) - 1) + 'i'
-
-        if runner.gpu_scratch_space is not None:
-            macro_args1.append(runner.gpu_scratch_space)
-            macro_args2.append(runner.gpu_scratch_space)
-            macro_signature += 'P'
-
-            args1.append(runner.gpu_scratch_space)
-            args2.append(runner.gpu_scratch_space)
-            args_signature += 'P'
-
-        macro_kernels = [
-            runner.get_kernel('PrepareMacroFields', macro_args1,
-                macro_signature,
-                needs_iteration=self.config.needs_iteration_num)]
-
-        if self.config.access_pattern == 'AB':
-            macro_kernels.append(
-                runner.get_kernel('PrepareMacroFields', macro_args2,
-                    macro_signature,
-                    needs_iteration=self.config.needs_iteration_num))
-        else:
-            macro_kernels.append(macro_kernels[-1])
-
-        sim_kernels = [
-            runner.get_kernel('CollideAndPropagate', args1,
-                args_signature,
-                needs_iteration=self.config.needs_iteration_num)]
-
-        if self.config.access_pattern == 'AB':
-            sim_kernels.append(
-                runner.get_kernel('CollideAndPropagate', args2,
-                    args_signature,
-                    needs_iteration=self.config.needs_iteration_num))
-        else:
-            sim_kernels.append(sim_kernels[-1])
-
-        return zip(macro_kernels, sim_kernels)
-
     def initial_conditions(self, runner):
         gpu_rho = runner.gpu_field(self.rho)
         gpu_phi = runner.gpu_field(self.phi)
@@ -203,7 +135,8 @@ def constants(self):
 
     @classmethod
     def fields(cls):
-        return [ScalarField('rho'), ScalarField('phi', need_nn=True), VectorField('v')]
+        return [ScalarField('rho'), ScalarField('phi', need_nn=True),
+                VectorField('v'), ScalarField('phi_laplacian')]
 
     @classmethod
     def add_options(cls, group, dim):
@@ -318,6 +251,101 @@ def _prepare_symbols(self):
                     self.S.wxx.append(-Rational(1, 24))
                     self.S.wyy.append(-Rational(1, 24))
 
+    def get_compute_kernels(self, runner, full_output, bulk):
+        gpu_rho = runner.gpu_field(self.rho)
+        gpu_phi = runner.gpu_field(self.phi)
+        gpu_lap = runner.gpu_field(self.phi_laplacian)
+        gpu_v = runner.gpu_field(self.v)
+        gpu_map = runner.gpu_geo_map()
+
+        gpu_dist1a = runner.gpu_dist(0, 0)
+        gpu_dist1b = runner.gpu_dist(0, 1)
+        gpu_dist2a = runner.gpu_dist(1, 0)
+        gpu_dist2b = runner.gpu_dist(1, 1)
+
+        options = 0
+        if full_output:
+            options |= 1
+        if bulk:
+            options |= 2
+
+        if hasattr(self, '_force_term_for_eq') and self._force_term_for_eq.get(1) == 0:
+            phi_args = [gpu_rho, gpu_phi]
+        else:
+            phi_args = [gpu_phi]
+
+        options = np.uint32(options)
+        # Primary.
+        args1a = ([gpu_map, gpu_dist1a, gpu_dist1b, gpu_rho, gpu_phi] +
+                  gpu_v + [gpu_lap, options])
+        args1b = ([gpu_map, gpu_dist2a, gpu_dist2b] + phi_args +
+                  gpu_v + [gpu_lap, options])
+        # Secondary.
+        args2a = ([gpu_map, gpu_dist1b, gpu_dist1a, gpu_rho, gpu_phi] +
+                  gpu_v + [gpu_lap, options])
+        args2b = ([gpu_map, gpu_dist2b, gpu_dist2a] + phi_args +
+                  gpu_v + [gpu_lap, options])
+
+        macro_args1 = [gpu_map, gpu_dist1a, gpu_dist2a, gpu_rho, gpu_phi,
+                       options]
+        macro_args2 = [gpu_map, gpu_dist1b, gpu_dist2b, gpu_rho, gpu_phi,
+                       options]
+
+        args_a_signature = 'P' * (len(args1a) - 1) + 'i'
+        args_b_signature = 'P' * (len(args1b) - 1) + 'i'
+        macro_signature = 'P' * (len(macro_args1) - 1) + 'i'
+
+        if runner.gpu_scratch_space is not None:
+            macro_args1.append(runner.gpu_scratch_space)
+            macro_args2.append(runner.gpu_scratch_space)
+            macro_signature += 'P'
+
+            args1a.append(runner.gpu_scratch_space)
+            args2a.append(runner.gpu_scratch_space)
+            args1b.append(runner.gpu_scratch_space)
+            args2b.append(runner.gpu_scratch_space)
+            args_a_signature += 'P'
+            args_b_signature += 'P'
+
+        macro = runner.get_kernel('FreeEnergyPrepareMacroFields', macro_args1,
+                                  macro_signature,
+                                  needs_iteration=self.config.needs_iteration_num)
+
+        if self.config.access_pattern == 'AB':
+            macro_secondary = runner.get_kernel('FreeEnergyPrepareMacroFields',
+                                                macro_args2,
+                                                macro_signature,
+                                                needs_iteration=self.config.needs_iteration_num)
+            macro_pair = KernelPair(macro, macro_secondary)
+        else:
+            macro_pair = KernelPair(macro, macro)
+
+        # Note: these two kernels need to be executed in order.
+        primary = [
+            runner.get_kernel('FreeEnergyCollideAndPropagateFluid', args1a,
+                              args_a_signature,
+                              needs_iteration=self.config.needs_iteration_num),
+            runner.get_kernel('FreeEnergyCollideAndPropagateOrderParam', args1b,
+                              args_b_signature,
+                              needs_iteration=self.config.needs_iteration_num)
+        ]
+
+        if self.config.access_pattern == 'AB':
+            secondary = [
+                runner.get_kernel('FreeEnergyCollideAndPropagateFluid', args2a,
+                                  args_a_signature,
+                                  needs_iteration=self.config.needs_iteration_num),
+                runner.get_kernel('FreeEnergyCollideAndPropagateOrderParam',
+                                  args2b,
+                                  args_b_signature,
+                                  needs_iteration=self.config.needs_iteration_num)
+            ]
+            sim_pair = KernelPair(primary, secondary)
+        else:
+            sim_pair = KernelPair(primary, primary)
+
+        return zip(macro_pair, sim_pair)
+
 
 class LBBinaryFluidShanChen(LBBinaryFluidBase, LBForcedSim):
     """Binary fluid mixture using the Shan-Chen model."""
@@ -359,3 +387,90 @@ def update_context(self, ctx):
         ctx['sc_potential'] = self.config.sc_potential
         ctx['tau'] = sym.relaxation_time(self.config.visc)
         ctx['visc'] = self.config.visc
+
+    def get_compute_kernels(self, runner, full_output, bulk):
+        gpu_rho = runner.gpu_field(self.rho)
+        gpu_phi = runner.gpu_field(self.phi)
+        gpu_v = runner.gpu_field(self.v)
+        gpu_map = runner.gpu_geo_map()
+
+        gpu_dist1a = runner.gpu_dist(0, 0)
+        gpu_dist1b = runner.gpu_dist(0, 1)
+        gpu_dist2a = runner.gpu_dist(1, 0)
+        gpu_dist2b = runner.gpu_dist(1, 1)
+
+        options = 0
+        if full_output:
+            options |= 1
+        if bulk:
+            options |= 2
+
+        options = np.uint32(options)
+        # Primary.
+        args1a = ([gpu_map, gpu_dist1a, gpu_dist1b, gpu_rho, gpu_phi] +
+                  gpu_v + [options])
+        args1b = ([gpu_map, gpu_dist2a, gpu_dist2b, gpu_rho, gpu_phi] +
+                  gpu_v + [options])
+        # Secondary.
+        args2a = ([gpu_map, gpu_dist1b, gpu_dist1a, gpu_rho, gpu_phi] +
+                  gpu_v + [options])
+        args2b = ([gpu_map, gpu_dist2b, gpu_dist2a, gpu_rho, gpu_phi] +
+                  gpu_v + [options])
+
+        macro_args1 = ([gpu_map, gpu_dist1a, gpu_dist2a, gpu_rho, gpu_phi] +
+                       gpu_v + [options])
+        macro_args2 = ([gpu_map, gpu_dist1b, gpu_dist2b, gpu_rho, gpu_phi] +
+                       gpu_v + [options])
+
+        args_a_signature = 'P' * (len(args1a) - 1) + 'i'
+        args_b_signature = 'P' * (len(args1b) - 1) + 'i'
+        macro_signature = 'P' * (len(macro_args1) - 1) + 'i'
+
+        if runner.gpu_scratch_space is not None:
+            macro_args1.append(runner.gpu_scratch_space)
+            macro_args2.append(runner.gpu_scratch_space)
+            macro_signature += 'P'
+
+            args1a.append(runner.gpu_scratch_space)
+            args2a.append(runner.gpu_scratch_space)
+            args1b.append(runner.gpu_scratch_space)
+            args2b.append(runner.gpu_scratch_space)
+            args_a_signature += 'P'
+            args_b_signature += 'P'
+
+        macro = runner.get_kernel('ShanChenPrepareMacroFields', macro_args1,
+                                  macro_signature,
+                                  needs_iteration=self.config.needs_iteration_num)
+
+        if self.config.access_pattern == 'AB':
+            macro_secondary = runner.get_kernel('ShanChenPrepareMacroFields', macro_args2,
+                                                macro_signature,
+                                                needs_iteration=self.config.needs_iteration_num)
+            macro_pair = KernelPair(macro, macro_secondary)
+        else:
+            macro_pair = KernelPair(macro, macro)
+
+        # TODO(michalj): These kernels can actually run in parallel.
+        primary = [
+            runner.get_kernel('ShanChenCollideAndPropagate0', args1a,
+                              args_a_signature,
+                              needs_iteration=self.config.needs_iteration_num),
+            runner.get_kernel('ShanChenCollideAndPropagate1', args1b,
+                              args_b_signature,
+                              needs_iteration=self.config.needs_iteration_num)
+        ]
+
+        if self.config.access_pattern == 'AB':
+            secondary = [
+                runner.get_kernel('ShanChenCollideAndPropagate0', args2a,
+                                  args_a_signature,
+                                  needs_iteration=self.config.needs_iteration_num),
+                runner.get_kernel('ShanChenCollideAndPropagate1', args2b,
+                                  args_b_signature,
+                                  needs_iteration=self.config.needs_iteration_num)
+            ]
+            sim_pair = KernelPair(primary, secondary)
+        else:
+            sim_pair = KernelPair(primary, primary)
+
+        return zip(macro_pair, sim_pair)
diff --git a/sailfish/lb_single.py b/sailfish/lb_single.py
index 5136806b..adb92409 100644
--- a/sailfish/lb_single.py
+++ b/sailfish/lb_single.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 from sailfish import subdomain_runner, sym, sym_equilibrium
-from sailfish.lb_base import LBSim, LBForcedSim, ScalarField, VectorField
+from sailfish.lb_base import LBSim, LBForcedSim, ScalarField, VectorField, KernelPair
 
 
 MacroKernels = namedtuple('MacroKernels', 'distributions macro')
@@ -71,13 +71,6 @@ def initial_conditions(self, runner):
             runner.exec_kernel('SetInitialConditions', args2, 'P'*len(args2))
 
     def get_compute_kernels(self, runner, full_output, bulk):
-        """
-        :param runner: SubdomainRunner object
-        :param full_output: if True, returns kernels that prepare fields for
-                visualization or saving into a file
-        :param bulk: if True, returns kernels that process the bulk domain,
-                otherwise returns kernels that process the subdomain boundary
-        """
         gpu_rho = runner.gpu_field(self.rho)
         gpu_v = runner.gpu_field(self.v)
         gpu_dist1a = runner.gpu_dist(0, 0)
@@ -109,21 +102,18 @@ def get_compute_kernels(self, runner, full_output, bulk):
             args2.append(runner.gpu_field(self.alpha))
             signature += 'P'
 
-        kernels = []
-
-        kernels.append(runner.get_kernel(
-                'CollideAndPropagate', args1, signature,
-                needs_iteration=self.config.needs_iteration_num))
+        cnp_primary = runner.get_kernel(
+            'CollideAndPropagate', args1, signature,
+            needs_iteration=self.config.needs_iteration_num)
 
         if self.config.access_pattern == 'AB':
             secondary_args = args2 if self.config.access_pattern == 'AB' else args1
-            kernels.append(runner.get_kernel(
-                    'CollideAndPropagate', secondary_args, signature,
-                    needs_iteration=self.config.needs_iteration_num))
+            cnp_secondary = runner.get_kernel(
+                'CollideAndPropagate', secondary_args, signature,
+                needs_iteration=self.config.needs_iteration_num)
+            return KernelPair([cnp_primary], [cnp_secondary])
         else:
-            kernels.append(kernels[-1])
-
-        return kernels
+            return KernelPair([cnp_primary], [cnp_primary])
 
     def get_pbc_kernels(self, runner):
         gpu_dist1a = runner.gpu_dist(0, 0)
diff --git a/sailfish/subdomain_runner.py b/sailfish/subdomain_runner.py
index e980b2a2..58d04a60 100644
--- a/sailfish/subdomain_runner.py
+++ b/sailfish/subdomain_runner.py
@@ -766,9 +766,9 @@ def step(self, sync_req):
 
     def _get_bulk_kernel(self, sync_req):
         if sync_req:
-            kernel = self._kernels_bulk_full[self._sim.iteration & 1]
+            kernel = self._kernels_bulk_full[self._sim.iteration & 1][0]
         else:
-            kernel = self._kernels_bulk_none[self._sim.iteration & 1]
+            kernel = self._kernels_bulk_none[self._sim.iteration & 1][0]
 
         return kernel, self._kernel_grid_bulk
 
@@ -824,9 +824,9 @@ def _step_boundary(self, sync_req):
 
         if self._boundary_blocks is not None:
             if sync_req:
-                kernel = self._kernels_bnd_full[self._sim.iteration & 1]
+                kernel = self._kernels_bnd_full[self._sim.iteration & 1][0]
             else:
-                kernel = self._kernels_bnd_none[self._sim.iteration & 1]
+                kernel = self._kernels_bnd_none[self._sim.iteration & 1][0]
             grid = self._boundary_blocks
         else:
             kernel, grid = self._get_bulk_kernel(sync_req)
@@ -1224,15 +1224,13 @@ def restore_checkpoint(self, fname):
 
             self._debug_set_dist(v, is_primary, dist_num)
 
-    def _prepare_compute_kernels(self, initial=False):
-        self._kernels_bulk_full = self._sim.get_compute_kernels(self, True,
-                                                                True)
-        self._kernels_bulk_none = self._sim.get_compute_kernels(self, False,
-                                                                True)
-        self._kernels_bnd_full = self._sim.get_compute_kernels(self, True,
-                                                               False)
-        self._kernels_bnd_none = self._sim.get_compute_kernels(self, False,
-                                                               False)
+    def _prepare_compute_kernels(self):
+        gck = self._sim.get_compute_kernels
+
+        self._kernels_bulk_full = gck(self, True, True)
+        self._kernels_bulk_none = gck(self, False, True)
+        self._kernels_bnd_full = gck(self, True, False)
+        self._kernels_bnd_none = gck(self, False, False)
 
     def run(self):
         self.config.logger.info("Initializing subdomain.")
@@ -1676,9 +1674,11 @@ def step(self, sync_req):
         # Actual simulation step.
         record_gpu_start(TimeProfile.BOUNDARY, str_calc)
         if has_boundary_split:
-            run(bnd_kernel_sim, grid_bnd, str_calc)
+            for k in bnd_kernel_sim:
+                run(k, grid_bnd, str_calc)
         else:
-            run(bulk_kernel_sim, grid_bulk, str_calc)
+            for k in bulk_kernel_sim:
+                run(k, grid_bulk, str_calc)
         ev = record_gpu_end(TimeProfile.BOUNDARY, str_calc)
         str_data.wait_for_event(ev)
 
@@ -1689,7 +1689,8 @@ def step(self, sync_req):
 
         record_gpu_start(TimeProfile.BULK, str_calc)
         if has_boundary_split:
-            run(bulk_kernel_sim, grid_bulk, str_calc)
+            for k in bulk_kernel_sim:
+                run(k, grid_bulk, str_calc)
         self._apply_pbc(self._pbc_kernels.distributions)
         record_gpu_end(TimeProfile.BULK, str_calc)
 
diff --git a/sailfish/sym_force.py b/sailfish/sym_force.py
index c490bbc2..ef90ee20 100644
--- a/sailfish/sym_force.py
+++ b/sailfish/sym_force.py
@@ -33,7 +33,6 @@ def needs_accel(i, forces, force_couplings):
     if type(forces) is Undefined:
         return False
 
-    # TODO: handle symbolic forces here
     return ((i in forces.numeric or i in forces.symbolic)
             or needs_coupling_accel(i, force_couplings))
 
diff --git a/sailfish/templates/binary_fluid.mako b/sailfish/templates/binary_fluid.mako
index cfb9f81a..f72c7628 100644
--- a/sailfish/templates/binary_fluid.mako
+++ b/sailfish/templates/binary_fluid.mako
@@ -3,29 +3,38 @@
     import sailfish.node_type as nt
     import sympy
 %>
+<%
+	# Necessary to support force reassignment in the free energy MRT model.
+    phi_needs_rho = (force_for_eq is not UNDEFINED and force_for_eq.get(1) == 0)
+%>
 
-<%def name="bgk_args_decl_sc()">
-	float rho, float phi, float *iv0, float *ea0, float *ea1
-</%def>
-
-<%def name="bgk_args_decl_fe()">
-	float rho, float phi, float lap1, float *iv0, float *grad1
+<%def name="bgk_args_decl_sc(grid_idx)">
+	%if grid_idx == 0:
+		float rho, float *iv0, float *ea${grid_idx}
+	%else:
+		float phi, float *iv0, float *ea${grid_idx}
+	%endif
 </%def>
 
-<%def name="bgk_args_sc()">
-	g0m0, g1m0, v, sca0, sca1
+<%def name="bgk_args_decl_fe(grid_idx)">
+	%if grid_idx == 0:
+		float rho, float phi, float lap1, float *iv0, float *grad1
+	%else:
+		${'float rho, ' if phi_needs_rho else ''} float phi, float lap1, float *iv0
+	%endif
 </%def>
 
-<%def name="bgk_args_fe()">
-	g0m0, g1m0, lap1, v, grad1
+<%def name="bgk_args_fe(grid_idx)">
+	%if grid_idx == 0:
+		g0m0, g1m0, lap1, v, grad1
+	%else:
+		${'g0m0, ' if phi_needs_rho else ''} g1m0, lap1, v
+	%endif
 </%def>
 
-// In the free-energy model, the relaxation time is a local quantity.
 %if simtype == 'shan-chen':
+	## In the free-energy model, the relaxation time is a local quantity.
 	${const_var} float tau0 = ${tau}f;		// relaxation time
-%endif
-
-%if simtype == 'shan-chen':
 	// Relaxation time for the 2nd fluid component.
 %else:
 	// Relaxation time for the order parameter field.
@@ -47,7 +56,9 @@ ${const_var} float tau1 = ${tau_phi}f;
 <%namespace file="propagation.mako" import="*"/>
 <%namespace file="utils.mako" import="*"/>
 
+%if simtype == 'free-energy':
 <%include file="finite_difference_optimized.mako"/>
+%endif
 
 <%def name="init_dist_with_eq()">
 	%if simtype == 'free-energy':
@@ -95,7 +106,8 @@ ${kernel} void SetInitialConditions(
 	${init_dist_with_eq()}
 }
 
-${kernel} void PrepareMacroFields(
+%if simtype == 'free-energy':
+${kernel} void FreeEnergyPrepareMacroFields(
 	${global_ptr} int *map,
 	${global_ptr} float *dist1_in,
 	${global_ptr} float *dist2_in,
@@ -106,23 +118,7 @@ ${kernel} void PrepareMacroFields(
 	${iteration_number_if_required()})
 {
 	${local_indices_split()}
-
-	int ncode = map[gi];
-	int type = decodeNodeType(ncode);
-
-	// Unused nodes do not participate in the simulation.
-	if (isExcludedNode(type))
-		return;
-
-	int orientation = decodeNodeOrientation(ncode);
-
-	%if simtype == 'shan-chen':
-		// Do not update the macroscopic fields for nodes which do not
-		// represent any fluid.
-		if (!isWetNode(type)) {
-			return;
-		}
-	%endif
+	${load_node_type()}
 
 	// Do not not update the fields for pressure nodes, where by definition
 	// they are constant.
@@ -135,99 +131,86 @@ ${kernel} void PrepareMacroFields(
 	Dist fi;
 	float out;
 
-	%if sim._fields['rho'].abstract.need_nn:
-		getDist(&fi, dist1_in, gi ${iteration_number_arg_if_required()});
+	if (isWetNode(type)) {
+		getDist(&fi, dist2_in, gi ${iteration_number_arg_if_required()});
 		get0thMoment(&fi, type, orientation, &out);
-		orho[gi] = out;
-	%endif
+		ophi[gi] = out;
+	}
 
-	%if simtype == 'free-energy':
-		if (isWetNode(type)) {
-			getDist(&fi, dist2_in, gi ${iteration_number_arg_if_required()});
+	## Assume neutral wetting for all walls by adjusting the phase gradient
+	## near the wall.
+	##
+	## This wetting boundary condition implementation is as in option 2 in
+	## Halim Kusumaatmaja's PhD thesis, p.18.
+
+	## Symbols used on the schematics below:
+	##
+	## W: wall node (current node, pointed to by 'gi')
+	## F: fluid node
+	## |: actual location of the wall
+	## .: space between fluid nodes
+	## x: node from which data is read
+	## y: node to which data is being written
+	##
+	## The schematics assume a bc_wall_grad_order of 2.
+	%if nt.NTFullBBWall in node_types:
+		int helper_idx = gi;
+		## Full BB: F . F | W
+		##          x ----> y
+		if (isNTFullBBWall(type)) {
+			switch (orientation) {
+				%for dir in grid.dir2vecidx.keys():
+					case ${dir}: {  // ${grid.dir_to_vec(dir)}
+						%if dim == 3:
+							helper_idx += ${rel_offset(*(bc_wall_grad_order*grid.dir_to_vec(dir)))};
+						%else:
+							## rel_offset() needs a 3-vector, so make the z-coordinate 0
+							helper_idx += ${rel_offset(*(list(bc_wall_grad_order*grid.dir_to_vec(dir)) + [0]))};
+						%endif
+						break;
+					}
+				%endfor
+			}
+			getDist(&fi, dist2_in, helper_idx ${iteration_number_arg_if_required()});
 			get0thMoment(&fi, type, orientation, &out);
-			ophi[gi] = out;
+			ophi[gi] = out - (${bc_wall_grad_order*bc_wall_grad_phase});
 		}
-
-		## Assume neutral wetting for all walls by adjusting the phase gradient
-		## near the wall.
-		##
-		## This wetting boundary condition implementation is as in option 2 in
-		## Halim Kusumaatmaja's PhD thesis, p.18.
-
-		## Symbols used on the schematics below:
-		##
-		## W: wall node (current node, pointed to by 'gi')
-		## F: fluid node
-		## |: actual location of the wall
-		## .: space between fluid nodes
-		## x: node from which data is read
-		## y: node to which data is being written
-		##
-		## The schematics assume a bc_wall_grad_order of 2.
-		%if nt.NTFullBBWall in node_types:
-			int helper_idx = gi;
-			## Full BB: F . F | W
-			##          x ----> y
-			if (isNTFullBBWall(type)) {
-				switch (orientation) {
-					%for dir in grid.dir2vecidx.keys():
-						case ${dir}: {  // ${grid.dir_to_vec(dir)}
-							%if dim == 3:
-								helper_idx += ${rel_offset(*(bc_wall_grad_order*grid.dir_to_vec(dir)))};
-							%else:
-								## rel_offset() needs a 3-vector, so make the z-coordinate 0
-								helper_idx += ${rel_offset(*(list(bc_wall_grad_order*grid.dir_to_vec(dir)) + [0]))};
-							%endif
-							break;
-						}
-					%endfor
-				}
-				getDist(&fi, dist2_in, helper_idx ${iteration_number_arg_if_required()});
-				get0thMoment(&fi, type, orientation, &out);
-				ophi[gi] = out - (${bc_wall_grad_order*bc_wall_grad_phase});
-			}
-		%endif  ## NTFullBBWall
-		%if nt.NTHalfBBWall in node_types:
-			%if bc_wall_grad_order != 1:
-				__ONLY_FIRST_ORDER_GRADIENTS_ARE_SUPPORTED_FOR_HALF_BB_WETTING_WALLS__
-			%endif
-			int helper_idx = gi;
-
-			## Half-way  BB: F . W | U
-			##               x ----> y
-			if (isNTHalfBBWall(type)) {
-				switch (orientation) {
-					%for dir in grid.dir2vecidx.keys():
-						case ${dir}: {  // ${grid.dir_to_vec(dir)}
-							%if dim == 3:
-								helper_idx -= ${rel_offset(*(grid.dir_to_vec(dir)))};
-							%else:
-								helper_idx -= ${rel_offset(*(list(grid.dir_to_vec(dir)) + [0]))};
-							%endif
-							break;
-						}
-					%endfor
-				}
-
-				ophi[helper_idx] = out - (${bc_wall_grad_order*bc_wall_grad_phase});
-			}
+	%endif  ## NTFullBBWall
+	%if nt.NTHalfBBWall in node_types:
+		%if bc_wall_grad_order != 1:
+			__ONLY_FIRST_ORDER_GRADIENTS_ARE_SUPPORTED_FOR_HALF_BB_WETTING_WALLS__
 		%endif
-	%else:	## shan-chen
-		getDist(&fi, dist2_in, gi ${iteration_number_arg_if_required()});
-		get0thMoment(&fi, type, orientation, &out);
-		ophi[gi] = out;
+		int helper_idx = gi;
+
+		## Half-way  BB: F . W | U
+		##               x ----> y
+		if (isNTHalfBBWall(type)) {
+			switch (orientation) {
+				%for dir in grid.dir2vecidx.keys():
+					case ${dir}: {  // ${grid.dir_to_vec(dir)}
+						%if dim == 3:
+							helper_idx -= ${rel_offset(*(grid.dir_to_vec(dir)))};
+						%else:
+							helper_idx -= ${rel_offset(*(list(grid.dir_to_vec(dir)) + [0]))};
+						%endif
+						break;
+					}
+				%endfor
+			}
+
+			ophi[helper_idx] = out - (${bc_wall_grad_order*bc_wall_grad_phase});
+		}
 	%endif
 }
 
-${kernel} void CollideAndPropagate(
+${kernel} void FreeEnergyCollideAndPropagateFluid(
 	${global_ptr} int *map,
 	${global_ptr} float *dist1_in,
 	${global_ptr} float *dist1_out,
-	${global_ptr} float *dist2_in,
-	${global_ptr} float *dist2_out,
 	${global_ptr} float *gg0m0,
 	${global_ptr} float *gg1m0,
 	${kernel_args_1st_moment('ov')}
+	${global_ptr} float *gg1laplacian,
 	int options
 	${scratch_space_if_required()}
 	${iteration_number_if_required()})
@@ -237,74 +220,83 @@ ${kernel} void CollideAndPropagate(
 	${load_node_type()}
 	${guo_density_node_index_shift_intro()}
 
-	%if simtype == 'free-energy':
-		float lap1, grad1[${dim}];
+	float lap1, grad1[${dim}];
+	if (isWetNode(type)) {
+		laplacian_and_grad(gg1m0, 1, gi, &lap1, grad1, gx, gy ${', gz' if dim == 3 else ''});
+	}
 
-		if (isWetNode(type)) {
-			%if dim == 2:
-				laplacian_and_grad(gg1m0, 1, gi, &lap1, grad1, gx, gy);
-			%else:
-				laplacian_and_grad(gg1m0, 1, gi, &lap1, grad1, gx, gy, gz);
-			%endif
-		}
-	%elif simtype == 'shan-chen':
-		${sc_calculate_accel()}
-	%endif
+	// Macroscopic quantities for the current cell.
+	float g0m0, v[${dim}], g1m0;
 
-	// cache the distributions in local variables
-	Dist d0, d1;
+	// Cache the distributions in local variables.
+	Dist d0;
 	getDist(&d0, dist1_in, gi ${iteration_number_arg_if_required()});
-	getDist(&d1, dist2_in, gi ${iteration_number_arg_if_required()});
-
+	g1m0 = gg1m0[gi];
 	${guo_density_restore_index()}
 
-	// macroscopic quantities for the current cell
-	float g0m0, v[${dim}], g1m0;
+	getMacro(&d0, ncode, type, orientation, &g0m0, v ${dynamic_val_call_args()});
 
-	%if simtype == 'free-energy':
-		getMacro(&d0, ncode, type, orientation, &g0m0, v ${dynamic_val_call_args()});
-		// TODO(michalj): Is this really needed?
-		get0thMoment(&d1, type, orientation, &g1m0);
-	%elif simtype == 'shan-chen':
-		${sc_macro_fields()}
-	%endif
-
-	precollisionBoundaryConditions(&d0, ncode, type, orientation, &g0m0, v);
-	precollisionBoundaryConditions(&d1, ncode, type, orientation, &g1m0, v);
+	// Save laplacian and velocity to global memory so that they can be reused
+	// in the relaxation of the order parameter field.
+	ovx[gi] = v[0];
+	ovy[gi] = v[1];
+	${'ovz[gi] = v[2]' if dim == 3 else ''};
+	gg1laplacian[gi] = lap1;
 
-	%if simtype == 'shan-chen':
-		${relaxate(bgk_args_sc)}
-	%elif simtype == 'free-energy':
-		${relaxate(bgk_args_fe)}
+	%if phi_needs_rho:
+		gg0m0[gi] = g0m0;
 	%endif
 
-	// FIXME: In order for the half-way bounce back boundary condition to work, a layer of unused
-	// nodes currently has to be placed behind the wall layer.
+	precollisionBoundaryConditions(&d0, ncode, type, orientation, &g0m0, v);
+	${relaxate(bgk_args_fe, 0)}
 	postcollisionBoundaryConditions(&d0, ncode, type, orientation, &g0m0, v, gi, dist1_out);
-	postcollisionBoundaryConditions(&d1, ncode, type, orientation, &g1m0, v, gi, dist2_out);
-
 	${guo_density_node_index_shift_final()}
+	${check_invalid_values()}
+	${save_macro_fields(velocity=False)}
+	${propagate('dist1_out', 'd0')}
+}
 
-	checkInvalidValues(&d0, ${position()});
-	checkInvalidValues(&d1, ${position()});
-
-	// Only save the macroscopic quantities if requested to do so.
-	if (options & OPTION_SAVE_MACRO_FIELDS) {
-		%if simtype == 'shan-chen':
-			if (isWetNode(type))
-		%endif
-		{
-			ovx[gi] = v[0];
-			ovy[gi] = v[1];
-			%if dim == 3:
-				ovz[gi] = v[2];
-			%endif
-		}
-	}
-
+${kernel} void FreeEnergyCollideAndPropagateOrderParam(
+	${global_ptr} int *map,
+	${global_ptr} float *dist1_in,
+	${global_ptr} float *dist1_out,
+	${global_ptr + ' float *gg0m0,' if phi_needs_rho else ''}
+	${global_ptr} float *gg1m0,
+	${kernel_args_1st_moment('ov')}
+	${global_ptr} float *gg1laplacian,
+	int options
+	${scratch_space_if_required()}
+	${iteration_number_if_required()})
+{
+	${local_indices_split()}
+	${shared_mem_propagation_vars()}
+	${load_node_type()}
+	${guo_density_node_index_shift_intro()}
+	// Cache the distributions in local variables.
+	Dist d0;
+	getDist(&d0, dist1_in, gi ${iteration_number_arg_if_required()});
+	${guo_density_restore_index()}
+	float lap1 = gg1laplacian[gi];
+	float g1m0, v[${dim}];
+	${'float g0m0 = gg0m0[gi];' if phi_needs_rho else ''}
+
+	v[0] = ovx[gi];
+	v[1] = ovy[gi];
+	${'v[2] = ovz[gi]' if dim == 3 else ''};
+	g1m0 = gg1m0[gi];
+
+	precollisionBoundaryConditions(&d0, ncode, type, orientation, &g1m0, v);
+	${relaxate(bgk_args_fe, 1)}
+	postcollisionBoundaryConditions(&d0, ncode, type, orientation, &g1m0, v, gi, dist1_out);
+	${guo_density_node_index_shift_final()}
+	${check_invalid_values()}
 	${propagate('dist1_out', 'd0')}
-	${barrier()}
-	${propagate('dist2_out', 'd1')}
 }
 
+%endif  ## free-energy
+
+%if simtype == 'shan-chen':
+<%include file="binary_shan_chen.mako"/>
+%endif  ## shan-chen
+
 <%include file="util_kernels.mako"/>
diff --git a/sailfish/templates/binary_shan_chen.mako b/sailfish/templates/binary_shan_chen.mako
new file mode 100644
index 00000000..26d9fff2
--- /dev/null
+++ b/sailfish/templates/binary_shan_chen.mako
@@ -0,0 +1,121 @@
+<%!
+    from sailfish import sym
+    import sailfish.node_type as nt
+    import sympy
+%>
+
+<%namespace file="opencl_compat.mako" import="*" name="opencl_compat"/>
+<%namespace file="utils.mako" import="*"/>
+<%namespace file="kernel_common.mako" import="*" name="kernel_common"/>
+<%namespace file="code_common.mako" import="*"/>
+<%namespace file="boundary.mako" import="*" name="boundary"/>
+<%namespace file="relaxation.mako" import="*" name="relaxation"/>
+<%namespace file="propagation.mako" import="*"/>
+<%namespace file="shan_chen.mako" import="*" name="shan_chen"/>
+
+<%def name="bgk_args_sc(grid_idx)">
+	g${grid_idx}m0, v, sca0
+</%def>
+
+${kernel} void ShanChenPrepareMacroFields(
+	${global_ptr} int *map,
+	${global_ptr} float *dist1_in,
+	${global_ptr} float *dist2_in,
+	${global_ptr} float *orho0,
+	${global_ptr} float *orho1,
+	${kernel_args_1st_moment('ov')}
+	int options
+	${scratch_space_if_required()}
+	${iteration_number_if_required()})
+{
+	${local_indices_split()}
+	${load_node_type()}
+
+	// Do not update the macroscopic fields for nodes which do not
+	// represent any fluid.
+	if (!isWetNode(type)) {
+		return;
+	}
+
+	// Do not not update the fields for pressure nodes, where by definition
+	// they are constant.
+	%for node_type in (nt.NTGuoDensity, nt.NTEquilibriumDensity, nt.NTZouHeDensity):
+		%if node_type in node_types:
+			if (is${node_type.__name__}(type)) { return; }
+		%endif
+	%endfor
+
+	Dist fi;
+	float rho0, rho1;
+	float v[${dim}];
+
+	// Velocity requires input from both lattices, so we calculate it here so
+	// that it can just be read out in the collision kernel.
+	v[0] = 0.0f;
+	v[1] = 0.0f;
+	${'v[2] = 0.0f;' if dim == 3 else ''}
+
+	getDist(&fi, dist1_in, gi ${iteration_number_arg_if_required()});
+	get0thMoment(&fi, type, orientation, &rho0);
+	compute_1st_moment(&fi, v, 0, 1.0f/tau0);
+	orho0[gi] = rho0;
+
+	// TODO: try moving this line earlier and see what the performance impact is.
+	getDist(&fi, dist2_in, gi ${iteration_number_arg_if_required()});
+	get0thMoment(&fi, type, orientation, &rho1);
+	compute_1st_moment(&fi, v, 1, 1.0f/tau1);
+	orho1[gi] = rho1;
+
+	// Velocity becomes a weighted average of the values for invidual components.
+	float total_rho = rho0 / tau0 + rho1 / tau1;
+	%for i in range(0, dim):
+		v[${i}] /= total_rho;
+	%endfor
+
+	ovx[gi] = v[0];
+	ovy[gi] = v[1];
+	${'ovz[gi] = v[2];' if dim == 3 else ''}
+}
+
+%for grid_idx in range(0, 2):
+${kernel} void ShanChenCollideAndPropagate${grid_idx}(
+	${global_ptr} int *map,
+	${global_ptr} float *dist1_in,
+	${global_ptr} float *dist1_out,
+	${global_ptr} float *gg0m0,
+	${global_ptr} float *gg1m0,
+	${kernel_args_1st_moment('ov')}
+	int options
+	${scratch_space_if_required()}
+	${iteration_number_if_required()})
+{
+	${local_indices_split()}
+	${shared_mem_propagation_vars()}
+	${load_node_type()}
+	${guo_density_node_index_shift_intro()}
+	float g${grid_idx}m0 = gg${grid_idx}m0[gi];
+	${sc_calculate_force(grid_idx=grid_idx)}
+
+	// Cache the distributions in local variables.
+	Dist d0;
+	getDist(&d0, dist1_in, gi ${iteration_number_arg_if_required()});
+
+	// Macroscopic quantities for the current cell.
+	float v[${dim}];
+	v[0] = ovx[gi];
+	v[1] = ovy[gi];
+	${'v[2] = ovz[gi]' if dim == 3 else ''};
+
+	${guo_density_restore_index()}
+
+	precollisionBoundaryConditions(&d0, ncode, type, orientation, &g${grid_idx}m0, v);
+	${relaxate(bgk_args_sc, grid_idx)}
+
+	// FIXME: In order for the half-way bounce back boundary condition to work, a layer of unused
+	// nodes currently has to be placed behind the wall layer.
+	postcollisionBoundaryConditions(&d0, ncode, type, orientation, &g${grid_idx}m0, v, gi, dist1_out);
+	${guo_density_node_index_shift_final()}
+	${check_invalid_values()}
+	${propagate('dist1_out', 'd0')}
+}
+%endfor
diff --git a/sailfish/templates/boundary.mako b/sailfish/templates/boundary.mako
index 0c1480a7..a77e143c 100644
--- a/sailfish/templates/boundary.mako
+++ b/sailfish/templates/boundary.mako
@@ -52,13 +52,14 @@ ${device_func} inline void node_param_get_vector(const int idx, float *out
 ${device_func} inline float node_param_get_scalar(const int idx ${dynamic_val_args_decl()}) {
 	%if (time_dependence or space_dependence) and symbol_idx_map:
 		if (idx >= ${non_symbolic_idxs}) {
-			float out;
 			switch (idx) {
 				%for key, val in symbol_idx_map.iteritems():
 					%if len(val) == 1:
-						case ${key}:
+						case ${key}: {
+							float out;
 							time_dep_param_${key}(&out ${dynamic_val_args()});
 							return out;
+						}
 					%endif
 				%endfor
 				default:
diff --git a/sailfish/templates/kernel_common.mako b/sailfish/templates/kernel_common.mako
index eb61a70f..f5d493bc 100644
--- a/sailfish/templates/kernel_common.mako
+++ b/sailfish/templates/kernel_common.mako
@@ -1,5 +1,6 @@
 <%!
-    from sailfish import sym
+  from sailfish import sym
+  import sailfish.node_type as nt
 %>
 
 <%page args="bgk_args_decl"/>
@@ -124,6 +125,30 @@
 	int orientation = decodeNodeOrientation(ncode);
 </%def>
 
+<%def name="check_invalid_values()">
+	## Grad outflow nodes use invalid values to tag directions lacking distribution
+	## data.
+	if (isWetNode(type) ${'&& !isNTGradFreeflow(type)' if nt.NTGradFreeflow in node_types else ''}) {
+		checkInvalidValues(&d0, ${position()});
+	}
+</%def>
+
+<%def name="save_macro_fields(density=True, velocity=True)">
+	// Only save the macroscopic quantities if requested to do so.
+	if ((options & OPTION_SAVE_MACRO_FIELDS) && isWetNode(type)
+		## Nodes using the Grad approximation use the velocity from the
+		## previous time step to compute the approximated distributions.
+		${'|| isNTGradFreeflow(type)' if nt.NTGradFreeflow in node_types else ''}) {
+		gg0m0[gi] = g0m0 ${' +1.0f' if config.minimize_roundoff else ''};
+
+		%if not initialization and velocity:
+			ovx[gi] = v[0];
+			ovy[gi] = v[1];
+			${'ovz[gi] = v[2]' if dim == 3 else ''};
+		%endif
+	}
+</%def>
+
 ## Defines local indices for bulk kernels.
 ## This is the same as local_indices(), but with proper offsets to skip
 ## the boundary.
@@ -410,10 +435,10 @@
 <%namespace file="relaxation.mako" import="*" name="relaxation"/>
 %endif
 
-<%def name="position_decl()">
-	int gx, int gy
+<%def name="position_decl(prefix='g')">
+	int ${prefix}x, int ${prefix}y
 	%if dim == 3:
-		, int gz
+		, int ${prefix}z
 	%endif
 </%def>
 
diff --git a/sailfish/templates/relaxation.mako b/sailfish/templates/relaxation.mako
index e0e5356b..cb9d8381 100644
--- a/sailfish/templates/relaxation.mako
+++ b/sailfish/templates/relaxation.mako
@@ -14,43 +14,44 @@ ${mrt.body()}
 %endif
 
 %if model == 'mrt' and simtype == 'free-energy':
-${device_func} inline void FE_MRT_relaxate(${bgk_args_decl()},
-%for i in range(0, len(grids)):
-	Dist *d${i},
-%endfor
+%for grid_idx in range(0, len(grids)):
+${device_func} inline void FE_MRT_relaxate${grid_idx}(${bgk_args_decl(grid_idx)}, Dist *d0,
 	int node_type
 	${dynamic_val_args_decl()})
 {
-	${bgk_relaxation_preamble()}
+	${body_force(force_for_eq.get(grid_idx, grid_idx))}
+	${bgk_relaxation_preamble(grid_idx)}
+	## The acceleration vector needs to declared if no force was set in the
+	## previous call to body_force().
+	${body_force(grid_idx, vector_decl=(force_for_eq.get(grid_idx, -1) is None))}
 
-	%for i in range(0, len(grids)):
-		%for idx in grid.idx_name:
-			## Use the BGK approximation for the relaxation of the order parameter field.
-			%if i == 1:
-				d${i}->${idx} += (feq${i}.${idx} - d${i}->${idx}) / tau${i};
-			%else:
-				feq${i}.${idx} = d${i}->${idx} - feq${i}.${idx};
-			%endif
-		%endfor
+	%for idx in grid.idx_name:
+		## Use the BGK approximation for the relaxation of the order parameter field.
+		%if grid_idx == 1:
+			d0->${idx} += (feq0.${idx} - d0->${idx}) / tau1;
+		%else:
+			feq0.${idx} = d0->${idx} - feq0.${idx};
+		%endif
 	%endfor
 
-	%for dst, src in sym.free_energy_mrt(sim.grid, 'd0', 'feq0'):
-		${dst} -= ${sym.make_float(src)};
-	%endfor
+	%if grid_idx == 0:
+		%for dst, src in sym.free_energy_mrt(sim.grid, 'd0', 'feq0'):
+			${dst} -= ${sym.make_float(src)};
+		%endfor
+	%endif
 
-	%for i in range(0, len(grids)):
-		## Is there a force acting on the current grid?
-		%if sym_force.needs_accel(i, forces, force_couplings):
-			${fluid_velocity(i)};
+	## Is there a force acting on the current grid?
+	%if sym_force.needs_accel(grid_idx, forces, force_couplings):
+		${fluid_velocity(grid_idx)};
 
-			%for val, idx in zip(sym_force.free_energy_external_force(sim, grid_num=i), grid.idx_name):
-				d${i}->${idx} += ${cex(val)};
-			%endfor
-		%endif
-	%endfor
+		%for val, idx in zip(sym_force.free_energy_external_force(sim, grid_num=grid_idx), grid.idx_name):
+			d0->${idx} += ${cex(val)};
+		%endfor
+	%endif
 
-	${fluid_output_velocity()}
+	${fluid_output_velocity(grid_idx)}
 }
+%endfor
 %endif  ## model == mrt && simtype == 'free-energy'
 
 %if model == 'elbm':
@@ -76,7 +77,7 @@ ${device_func} inline void ELBM_relaxate(${bgk_args_decl()}, Dist* d0
 	%>
 
 	float v0[${dim}];
-	${body_force()}
+	${body_force(grid_idx=0)}
 	${fluid_velocity(0)};
 
 	## Local variables used by the equilibrium.
@@ -109,7 +110,7 @@ ${device_func} inline void ELBM_relaxate(${bgk_args_decl()}, Dist* d0
 		d0->${idx} += alpha * fneq0.${idx};
 	%endfor
 
-	${fluid_output_velocity()}
+	${fluid_output_velocity(0)}
 }
 %endif  ## model == elbm
 
@@ -117,46 +118,42 @@ ${device_func} inline void ELBM_relaxate(${bgk_args_decl()}, Dist* d0
 //
 // Performs the relaxation step in the BGK model given the density rho,
 // the velocity v and the distribution fi.
-//
-${device_func} inline void BGK_relaxate(${bgk_args_decl()},
-%for i in range(0, len(grids)):
-	Dist *d${i},
-%endfor
-	int node_type, int ncode
+%for grid_idx in range(len(grids)):
+${device_func} inline void BGK_relaxate${grid_idx}(${bgk_args_decl(grid_idx)},
+	Dist *d0, int node_type, int ncode
 	${dynamic_val_args_decl()})
 {
-	${bgk_relaxation_preamble()}
+	${body_force(grid_idx)}
+	${bgk_relaxation_preamble(grid_idx)}
 
-	%for i in range(0, len(grids)):
-		%for idx in grid.idx_name:
-			d${i}->${idx} += (feq${i}.${idx} - d${i}->${idx}) / tau${i};
-		%endfor
+	%for idx in grid.idx_name:
+		d0->${idx} += (feq0.${idx} - d0->${idx}) / tau${grid_idx};
+	%endfor
 
-		%if nt.NTGuoDensity in node_types:
-			// The total form of the postcollision boundary node distribution value
-			// with the Guo boundary conditions is as follows:
-			//
-			// f_post(O) = f_eq(O) + f(B) - f_eq(B) + omega * (f_eq(B) - f(B))
-			//
-			// where O is the boundary node and B is the fluid node pointed to by the
-			// boundary node normal vector.  The Guo boudary condtiions are implemented
-			// so that all the standard processing proceeds for node B first, and the
-			// correction for node O is added as a postcollision boundary condition.
-			//
-			// The code below takes care of the -f_eq(B) of the formula.
-			if (isNTGuoDensity(node_type)) {
-				%for idx in grid.idx_name:
-					d${i}->${idx} -= feq${i}.${idx};
-				%endfor
-			}
-		%endif
+	%if nt.NTGuoDensity in node_types:
+		// The total form of the postcollision boundary node distribution value
+		// with the Guo boundary conditions is as follows:
+		//
+		// f_post(O) = f_eq(O) + f(B) - f_eq(B) + omega * (f_eq(B) - f(B))
+		//
+		// where O is the boundary node and B is the fluid node pointed to by the
+		// boundary node normal vector.  The Guo boudary condtiions are implemented
+		// so that all the standard processing proceeds for node B first, and the
+		// correction for node O is added as a postcollision boundary condition.
+		//
+		// The code below takes care of the -f_eq(B) of the formula.
+		if (isNTGuoDensity(node_type)) {
+			%for idx in grid.idx_name:
+				d0->${idx} -= feq0.${idx};
+			%endfor
+		}
+	%endif
 
-		## Is there a force acting on the current grid?
-		%if sym_force.needs_accel(i, forces, force_couplings):
-			${fluid_velocity(i)};
-			${apply_body_force(i)};
-		%endif
-	%endfor
+	## Is there a force acting on the current grid?
+	%if sym_force.needs_accel(grid_idx, forces, force_couplings):
+		${fluid_velocity(grid_idx)};
+		${apply_body_force(grid_idx)};
+	%endif
 
 	// FIXME: This should be moved to postcollision boundary conditions.
 	%if nt.NTGuoDensity in node_types:
@@ -166,35 +163,27 @@ ${device_func} inline void BGK_relaxate(${bgk_args_decl()},
 			float par_phi = 1.0f;
 			tau0 = tau_a;
 
-			%for i, eq in enumerate([f(g, config) for f, g, in zip(equilibria, grids)]):
-				%for local_var in eq.local_vars:
-					const float ${cex(local_var.lhs)} =
-						${cex(local_var.rhs, rho='par_rho', phi='par_phi')};
-				%endfor
-				%for feq, idx in zip(eq.expression, grid.idx_name):
-					d${i}->${idx} += ${cex(feq, rho='par_rho', phi='par_phi')};
-				%endfor
+			<% eq = equilibria[grid_idx](grids[grid_idx], config) %>
+			%for local_var in eq.local_vars:
+				const float ${cex(local_var.lhs)} =
+					${cex(local_var.rhs, rho='par_rho', phi='par_phi')};
+			%endfor
+			%for feq, idx in zip(eq.expression, grid.idx_name):
+				d0->${idx} += ${cex(feq, rho='par_rho', phi='par_phi')};
 			%endfor
 		}
 	%endif
 
-	${fluid_output_velocity()}
+	${fluid_output_velocity(grid_idx)}
 }
+%endfor
 %endif
 
-<%def name="_relaxate(bgk_args)">
+<%def name="_relaxate(bgk_args, grid_idx)">
 	%if model == 'bgk':
-		BGK_relaxate(${bgk_args()},
-%for i in range(0, len(grids)):
-	&d${i},
-%endfor
-	type, ncode ${dynamic_val_call_args()});
+		BGK_relaxate${grid_idx}(${bgk_args(grid_idx)}, &d0, type, ncode ${dynamic_val_call_args()});
 	%elif model == 'mrt' and simtype == 'free-energy':
-		FE_MRT_relaxate(${bgk_args()},
-%for i in range(0, len(grids)):
-	&d${i},
-%endfor
-	type ${dynamic_val_call_args()});
+		FE_MRT_relaxate${grid_idx}(${bgk_args(grid_idx)}, &d0, type ${dynamic_val_call_args()});
 	%elif model == 'elbm':
 		ELBM_relaxate(${bgk_args()}, &d0 ${dynamic_val_call_args()} ${cond(alpha_output, ', options, alpha + gi')});
 	%else:
@@ -202,10 +191,10 @@ ${device_func} inline void BGK_relaxate(${bgk_args_decl()},
 	%endif
 </%def>
 
-<%def name="relaxate(bgk_args)">
+<%def name="relaxate(bgk_args, grid_idx=0)">
 	%if relaxation_enabled:
 		if (isWetNode(type)) {
-			${_relaxate(bgk_args)}
+			${_relaxate(bgk_args, grid_idx)}
 		}
 	%endif
 </%def>
diff --git a/sailfish/templates/relaxation_common.mako b/sailfish/templates/relaxation_common.mako
index f801def4..70581862 100644
--- a/sailfish/templates/relaxation_common.mako
+++ b/sailfish/templates/relaxation_common.mako
@@ -5,30 +5,30 @@
 <%namespace file="code_common.mako" import="*"/>
 
 ## Defines the actual acceleration vectors.
-<%def name="body_force()">
+<%def name="body_force(grid_idx, vector_decl=True)">
 	%if forces is not UNDEFINED and (forces.numeric or forces.symbolic):
-		// Body force acceleration.
-		%if forces.symbolic and time_dependence:
-			float phys_time = get_time_from_iteration(iteration_number);
-		%endif
-		%for i in range(0, len(grids)):
-			%if sym_force.needs_accel(i, forces, {}):
-				%if not sym_force.needs_coupling_accel(i, force_couplings):
-					float ea${i}[${dim}];
-					%for j in range(0, dim):
-						ea${i}[${j}] = ${cex(sym_force.body_force_accel(i, j, forces, accel=True))};
-					%endfor
-				%else:
-					## If the current grid has a Shan-Chen force acting on it, the acceleration vector
-					## is already externally defined in the Shan-Chen code.
-					%for j in range(0, dim):
-						%if i in forces.symbolic or i in forces.numeric:
-							ea${i}[${j}] += ${cex(sym_force.body_force_accel(i, j, forces, accel=True))};
-						%endif
-					%endfor
+		%if sym_force.needs_accel(grid_idx, forces, {}):
+			%if forces.symbolic and time_dependence:
+				float phys_time = get_time_from_iteration(iteration_number);
+			%endif
+			// Body force acceleration.
+			%if not sym_force.needs_coupling_accel(grid_idx, force_couplings):
+				%if vector_decl:
+					float ea${grid_idx}[${dim}];
 				%endif
+				%for j in range(0, dim):
+					ea${grid_idx}[${j}] = ${cex(sym_force.body_force_accel(grid_idx, j, forces, accel=True))};
+				%endfor
+			%else:
+				## If the current grid has a Shan-Chen force acting on it, the acceleration vector
+				## is already externally defined in the Shan-Chen code.
+				%for j in range(0, dim):
+					%if grid_idx in forces.symbolic or grid_idx in forces.numeric:
+						ea${grid_idx}[${j}] += ${cex(sym_force.body_force_accel(grid_idx, j, forces, accel=True))};
+					%endif
+				%endfor
 			%endif
-		%endfor
+		%endif
 	%endif
 </%def>
 
@@ -37,7 +37,7 @@
 	// Guo's method, eqs. 19 and 20 from 10.1103/PhysRevE.65.046308.
 	const float pref = ${cex(sym_force.guo_external_force_pref(grids[i], config, grid_num=i))};
 	%for val, idx in zip(sym_force.guo_external_force(grid, grid_num=i), grid.idx_name):
-		d${i}->${idx} += ${cex(val)};
+		d0->${idx} += ${cex(val)};
 	%endfor
 }
 </%def>
@@ -46,7 +46,7 @@
 {
 	// Exact difference method.
 	%for feq_shifted, idx in zip(sym_force.edm_shift_velocity(equilibria[i](grids[i], config).expression), grid.idx_name):
-		d${i}->${idx} += ${cex(feq_shifted)} - feq${i}.${idx};
+		d0->${idx} += ${cex(feq_shifted)} - feq${i}.${idx};
 	%endfor
 }
 </%def>
@@ -57,7 +57,7 @@
 <%def name="apply_body_force(i)">
 	%if simtype == 'free-energy':
 		%for val, idx in zip(sym_force.free_energy_external_force(sim, grid_num=i), grid.idx_name):
-			d${i}->${idx} += ${cex(val)};
+			d0->${idx} += ${cex(val)};
 		%endfor
 	%else:
 		%if force_implementation == 'guo':
@@ -104,10 +104,10 @@
 	%endif
 </%def>
 
-<%def name="update_relaxation_time()">
+<%def name="update_relaxation_time(grid_idx)">
 	## In models where the relaxation time is constant everywhere, tau0 is a global
 	## constant and does not need to be declared here.
-	%if simtype == 'free-energy':
+	%if simtype == 'free-energy' and grid_idx == 0:
 		// Linear interpolation of relaxation time.
 		float tau0 = tau_b + (phi + 1.0f) * (tau_a - tau_b) / 2.0f;
 		if (phi < -1.0f) {
@@ -119,60 +119,54 @@
 
 	%if subgrid == 'les-smagorinsky':
 		// Compute relaxation time using the standard viscosity-relaxation time relation.
-		%for i in range(0, len(grids)):
-			float tau${i} = 0.5f + 3.0f * visc;
-		%endfor
+		float tau0 = 0.5f + 3.0f * visc;
 
-		## FIXME: This will not work properly for multifluid models.
+		// TODO(michalj): Fix this for multifluid models.
 		// Modify the relaxation time proportionally to the modulus of the local strain rate tensor.
 		// The 2nd order tensor formed from the equilibrium distributions is rho / 3 * \delta_{ab} + rho u_a u_b
 		{
 			float tmp, strain;
 
-			%for i in range(0, len(grids)):
-				strain = 0.0f;
-
-				// Off-diagonal components, count twice for symmetry reasons.
-				%for a in range(0, dim):
-					%for b in range(a+1, dim):
-						 tmp = ${cex(sym.ex_flux(grid, 'd%d' % i, a, b, config), pointers=True)} -
-							   ${cex(sym.S.rho * grid.v[a] * grid.v[b])};
-						 strain += 2.0f * tmp * tmp;
-					%endfor
-				%endfor
+			strain = 0.0f;
 
-				// Diagonal components.
-				%for a in range(0, dim):
-					tmp = ${cex(sym.ex_flux(grid, 'd%d' % i, a, a, config), pointers=True)} -
-						  ${cex(sym.S.rho * (grid.v[a] * grid.v[b] + grid.cssq))};
-					strain += tmp * tmp;
+			// Off-diagonal components, count twice for symmetry reasons.
+			%for a in range(0, dim):
+				%for b in range(a+1, dim):
+					 tmp = ${cex(sym.ex_flux(grid, 'd0', a, b, config), pointers=True)} -
+						   ${cex(sym.S.rho * grid.v[a] * grid.v[b])};
+					 strain += 2.0f * tmp * tmp;
 				%endfor
+			%endfor
 
-				// Form of the relaxation time correction as in comp-gas/9401004v1.
-				tau${i} += (sqrtf(visc*visc + 18.0f * ${cex(smagorinsky_const**2)} *
-							sqrtf(strain)) - visc) / 2.0f;
+			// Diagonal components.
+			%for a in range(0, dim):
+				tmp = ${cex(sym.ex_flux(grid, 'd0', a, a, config), pointers=True)} -
+					  ${cex(sym.S.rho * (grid.v[a] * grid.v[b] + grid.cssq))};
+				strain += tmp * tmp;
 			%endfor
+
+			// Form of the relaxation time correction as in comp-gas/9401004v1.
+			tau0 += (sqrtf(visc*visc + 18.0f * ${cex(smagorinsky_const**2)} *
+					  sqrtf(strain)) - visc) / 2.0f;
 		}
 	%endif
 </%def>
 
 ## Code common to all BGK-like relaxation models.
-<%def name="bgk_relaxation_preamble()">
+<%def name="bgk_relaxation_preamble(grid_idx=0)">
 	float v0[${dim}];
-	${body_force()}
 
-	%for i, eq in enumerate([f(g, config) for f, g in zip(equilibria, grids)]):
-		Dist feq${i};
-		${fluid_velocity(i, equilibrium=True)};
+	<% eq = equilibria[grid_idx](grids[grid_idx], config) %>
+	Dist feq0;
+	${fluid_velocity(grid_idx, equilibrium=True)};
 
-		%for local_var in eq.local_vars:
-			float ${cex(local_var.lhs)} = ${cex(local_var.rhs)};
-		%endfor
+	%for local_var in eq.local_vars:
+		float ${cex(local_var.lhs)} = ${cex(local_var.rhs)};
+	%endfor
 
-		%for feq, idx in zip(eq.expression, grid.idx_name):
-			feq${i}.${idx} = ${cex(feq)};
-		%endfor
+	%for feq, idx in zip(eq.expression, grid.idx_name):
+		feq0.${idx} = ${cex(feq)};
 	%endfor
 
-	${update_relaxation_time()}
+	${update_relaxation_time(grid_idx)}
 </%def>
diff --git a/sailfish/templates/shan_chen.mako b/sailfish/templates/shan_chen.mako
index 3ac59ccd..7931f241 100644
--- a/sailfish/templates/shan_chen.mako
+++ b/sailfish/templates/shan_chen.mako
@@ -4,60 +4,28 @@
 
 <%namespace file="utils.mako" import="get_field_loc,get_field_off"/>
 <%namespace file="code_common.mako" import="cex"/>
+<%namespace file="kernel_common.mako" import="*"/>
 
-<%def name="sc_calculate_accel()">
-##
-## Declare and evaluate the Shan-Chen accelerations.
-##
-	%for x in set(sum(force_couplings.keys(), ())):
-		float sca${x}[${dim}];
-	%endfor
+## Declares and evaluate the Shan-Chen forces.
+<%def name="sc_calculate_force(grid_idx=0)">
+	float sca0[${dim}];
 
 	if (isWetNode(type)) {
 		%for dists, coupling_const in force_couplings.iteritems():
-			// Interaction force between two components.
-			%if dists[0] != dists[1]:
-				%if dim == 2:
-					shan_chen_force(gi, gg${dists[0]}m0, gg${dists[1]}m0,
-						${coupling_const}, sca${dists[0]}, sca${dists[1]}, gx, gy);
-				%else:
-					shan_chen_force(gi, gg${dists[0]}m0, gg${dists[1]}m0,
-						${coupling_const}, sca${dists[0]}, sca${dists[1]}, gx, gy, gz);
-				%endif
-			// Self-interaction force of a single component.
-			%else:
-				%if dim == 2:
-					shan_chen_force_self(gi, gg${dists[0]}m0, ${coupling_const}, sca${dists[0]}, gx, gy);
-				%else:
-					shan_chen_force_self(gi, gg${dists[0]}m0, ${coupling_const}, sca${dists[0]}, gx, gy, gz);
-				%endif
+			%if dists[0] == grid_idx:
+				shan_chen_force(gi, g${grid_idx}m0, gg${dists[1]}m0,
+								${coupling_const}, sca0, ${position()});
+			%elif dists[1] == grid_idx:
+				shan_chen_force(gi, g${grid_idx}m0, gg${dists[0]}m0,
+								${coupling_const}, sca0, ${position()});
 			%endif
 		%endfor
-	}
-</%def>
-
-<%def name="sc_macro_fields()">
-	// Calculates the density and velocity for the Shan-Chen coupled fields.
-	// Density and velocity become weighted averages of the values for invidual components.
-	float total_dens;
 
-	%for i, x in enumerate(set(sum(force_couplings.keys(), ()))):
-		get0thMoment(&d${x}, type, orientation, &g${x}m0);
-		compute_1st_moment(&d${x}, v, ${i}, 1.0f/tau${x});
-	%endfor
-
-	total_dens = 0.0f;
-	%for x in set(sum(force_couplings.keys(), ())):
-		total_dens += g${x}m0 / tau${x};
-	%endfor
-
-	// Convert momentum and force into velocity and acceleration.
-	%for i in range(0, dim):
-		%for x in set(sum(force_couplings.keys(), ())):
-			sca${x}[${i}] /= g${x}m0;
+		// Convert momentum and force into velocity and acceleration.
+		%for i in range(0, dim):
+			sca0[${i}] /= g${grid_idx}m0;
 		%endfor
-		v[${i}] /= total_dens;
-	%endfor
+	}
 </%def>
 
 // Calculates the Shan-Chan pseudopotential.
@@ -67,80 +35,24 @@ ${device_func} inline float sc_ppot(${global_ptr} float *field, int gi)
 	return ${cex(sym.SHAN_CHEN_POTENTIALS[sc_potential]('lfield'))};
 }
 
-// Calculates the Shan-Chen force between a single fluid component (self-interaction).
-// The form of the interaction is the same as that of a force between two components (see below).
-${device_func} inline void shan_chen_force_self(int i, ${global_ptr} float *f1,
-		float cc, float *a1, int x, int y
-%if dim == 3:
-	, int z
-%endif
-)
-{
-	float t1;
-
-	%for i in range(0, dim):
-		a1[${i}] = 0.0f;
-	%endfor
-
-	%if block.envelope_size != 0:
-		int off;
-	%endif
-
-	int gi;		// global index
-
-	%for i, ve in enumerate(grid.basis):
-		%if ve.dot(ve) != 0.0:
-			// ${ve}
-			%if block.envelope_size == 0:
-				${get_field_loc(*ve)};
-			%else:
-				${get_field_off(*ve)}
-				gi = i + off;
-			%endif
-
-			t1 = sc_ppot(f1, gi);
-
-			%if ve[0] != 0:
-				a1[0] += t1 * ${ve[0] * grid.weights[i]};
-			%endif
-			%if ve[1] != 0:
-				a1[1] += t1 * ${ve[1] * grid.weights[i]};
-			%endif
-			%if dim == 3 and ve[2] != 0:
-				a1[2] += t1 * ${ve[2] * grid.weights[i]};
-			%endif
-		%endif
-	%endfor
-
-	// Local node -- no offset.
-	t1 = sc_ppot(f1, i);
-
-	%for i in range(0, dim):
-		a1[${i}] *= t1 * cc;
-	%endfor
-}
-
 // Calculates the Shan-Chen force between two fluid components.
 //
 //  F = -G * \phi_A(x) \sum_i w_i e_i \phi_B(x + e_i)
 //
 // i: global node index
+// rho: density at the current node
+// field: (density) fielf of the other fluid component
 // f1, f2: fields
 // cc: coupling constant
-// a1, a2: Shan-Chen accelerations (output variables)
+// force: Shan-Chen force (output variable)
 // x, y, [z]: position of the node
-${device_func} inline void shan_chen_force(int i, ${global_ptr} float *f1, ${global_ptr} float *f2,
-float cc, float *a1, float *a2, int x, int y
-%if dim == 3:
-	, int z
-%endif
-)
+${device_func} inline void shan_chen_force(int i, float rho, ${global_ptr} float *field,
+float cc, float *force, ${position_decl(prefix='')})
 {
-	float t1, t2;
+	float psi;
 
 	%for i in range(0, dim):
-		a1[${i}] = 0.0f;
-		a2[${i}] = 0.0f;
+		force[${i}] = 0.0f;
 	%endfor
 
 	%if block.envelope_size != 0:
@@ -159,30 +71,20 @@ float cc, float *a1, float *a2, int x, int y
 				gi = i + off;
 			%endif
 
-			t1 = sc_ppot(f1, gi);
-			t2 = sc_ppot(f2, gi);
+			psi = sc_ppot(field, gi);
 
-			%if ve[0] != 0:
-				a1[0] += t2 * ${ve[0] * grid.weights[i]};
-				a2[0] += t1 * ${ve[0] * grid.weights[i]};
-			%endif
-			%if ve[1] != 0:
-				a1[1] += t2 * ${ve[1] * grid.weights[i]};
-				a2[1] += t1 * ${ve[1] * grid.weights[i]};
-			%endif
-			%if dim == 3 and ve[2] != 0:
-				a1[2] += t2 * ${ve[2] * grid.weights[i]};
-				a2[2] += t1 * ${ve[2] * grid.weights[i]};
-			%endif
+			%for j, component in enumerate(ve):
+				%if component != 0:
+					force[${j}] += psi * ${component * grid.weights[i]};
+				%endif
+			%endfor
 		%endif
 	%endfor
 
 	// Local node -- no offset.
-	t1 = sc_ppot(f1, i);
-	t2 = sc_ppot(f2, i);
+	psi = ${cex(sym.SHAN_CHEN_POTENTIALS[sc_potential]('rho'))};
 
 	%for i in range(0, dim):
-		a1[${i}] *= - t1 * cc;
-		a2[${i}] *= - t2 * cc;
+		force[${i}] *= - psi * cc;
 	%endfor
 }
diff --git a/sailfish/templates/single_fluid.mako b/sailfish/templates/single_fluid.mako
index c6158ddc..c527cc30 100644
--- a/sailfish/templates/single_fluid.mako
+++ b/sailfish/templates/single_fluid.mako
@@ -7,14 +7,14 @@
 	${const_var} float gravity = ${gravity}f;
 %endif
 
-<%def name="bgk_args_decl()">
+<%def name="bgk_args_decl(grid_idx=0)">
 	float rho, float *iv0
 	%if simtype == 'shan-chen':
 		, float *ea0
 	%endif
 </%def>
 
-<%def name="bgk_args()">
+<%def name="bgk_args(grid_idx=0)">
 	g0m0, v
 	%if simtype == 'shan-chen':
 		, sca0
@@ -174,17 +174,12 @@ ${kernel} void CollideAndPropagate(
 							ovx, ovy ${', ovz' if dim == 3 else ''}, gg0m0
 							${scratch_space_arg_if_required()});
 
-	%if simtype == 'shan-chen':
-		${sc_calculate_accel()}
-	%endif
-
 	// Macroscopic quantities for the current cell
 	float g0m0, v[${dim}];
+	getMacro(&d0, ncode, type, orientation, &g0m0, v ${dynamic_val_call_args()});
 
 	%if simtype == 'shan-chen':
-		${sc_macro_fields()}
-	%else:
-		getMacro(&d0, ncode, type, orientation, &g0m0, v ${dynamic_val_call_args()});
+		${sc_calculate_force()}
 	%endif
 
 	precollisionBoundaryConditions(&d0, ncode, type, orientation, &g0m0, v);
@@ -198,29 +193,8 @@ ${kernel} void CollideAndPropagate(
 	${relaxate(bgk_args)}
 	postcollisionBoundaryConditions(&d0, ncode, type, orientation, &g0m0, v, gi, dist_out
 									${scratch_space_arg_if_required()});
-
-	## Grad outflow nodes use invalid values to tag directions lacking distribution
-	## data.
-	if (isWetNode(type) ${'&& !isNTGradFreeflow(type)' if nt.NTGradFreeflow in node_types else ''}) {
-		checkInvalidValues(&d0, ${position()});
-	}
-
-	// Only save the macroscopic quantities if requested to do so.
-	if ((options & OPTION_SAVE_MACRO_FIELDS)
-		## Nodes using the Grad approximation use the velocity from the
-		## previous time step to compute the approximated distributions.
-		${'|| isNTGradFreeflow(type)' if nt.NTGradFreeflow in node_types else ''}) {
-		gg0m0[gi] = g0m0 ${' +1.0f' if config.minimize_roundoff else ''};
-
-		%if not initialization:
-			ovx[gi] = v[0];
-			ovy[gi] = v[1];
-			%if dim == 3:
-				ovz[gi] = v[2];
-			%endif
-		%endif
-	}
-
+	${check_invalid_values()}
+	${save_macro_fields()}
 	${propagate('dist_out', 'd0')}
 }