diff --git a/pymc3_hmm/distributions.py b/pymc3_hmm/distributions.py
index 1c7ed45..0b8ebad 100644
--- a/pymc3_hmm/distributions.py
+++ b/pymc3_hmm/distributions.py
@@ -1,21 +1,9 @@
 import warnings
 
 import numpy as np
-
-try:  # pragma: no cover
-    import aesara
-    import aesara.tensor as at
-    from aesara.graph.op import get_test_value
-    from aesara.scalar import upcast
-    from aesara.tensor.extra_ops import broadcast_to as at_broadcast_to
-except ImportError:  # pragma: no cover
-    import theano as aesara
-    import theano.tensor as at
-    from theano.graph.op import get_test_value
-    from theano.scalar import upcast
-    from theano.tensor.extra_ops import broadcast_to as at_broadcast_to
-
 import pymc3 as pm
+import theano
+import theano.tensor as tt
 from pymc3.distributions.distribution import (
     Distribution,
     _DrawValuesContext,
@@ -23,6 +11,9 @@
     generate_samples,
 )
 from pymc3.distributions.mixture import _conversion_map, all_discrete
+from theano.graph.op import get_test_value
+from theano.scalar import upcast
+from theano.tensor.extra_ops import broadcast_to as tt_broadcast_to
 
 from pymc3_hmm.utils import tt_broadcast_arrays, vsearchsorted
 
@@ -62,7 +53,7 @@ def distribution_subset_args(dist, shape, idx):
     res = dict()
     for param in dist_param_names:
 
-        bcast_res = at_broadcast_to(getattr(dist, param), shape)
+        bcast_res = tt_broadcast_to(getattr(dist, param), shape)
 
         res[param] = bcast_res[idx]
 
@@ -110,7 +101,7 @@ def __init__(self, comp_dists, states, *args, **kwargs):
             equal to the size of `comp_dists`.
 
         """
-        self.states = at.as_tensor_variable(pm.intX(states))
+        self.states = tt.as_tensor_variable(pm.intX(states))
 
         if len(comp_dists) > 31:
             warnings.warn(
@@ -136,7 +127,7 @@ def __init__(self, comp_dists, states, *args, **kwargs):
             bcast_means = tt_broadcast_arrays(
                 *([self.states] + [d.mean.astype(dtype) for d in self.comp_dists])
             )
-            self.mean = at.choose(self.states, bcast_means[1:])
+            self.mean = tt.choose(self.states, bcast_means[1:])
 
             if "mean" not in defaults:
                 defaults.append("mean")
@@ -148,7 +139,7 @@ def __init__(self, comp_dists, states, *args, **kwargs):
             bcast_modes = tt_broadcast_arrays(
                 *([self.states] + [d.mode.astype(dtype) for d in self.comp_dists])
             )
-            self.mode = at.choose(self.states, bcast_modes[1:])
+            self.mode = tt.choose(self.states, bcast_modes[1:])
 
             if "mode" not in defaults:
                 defaults.append("mode")
@@ -161,16 +152,16 @@ def __init__(self, comp_dists, states, *args, **kwargs):
 
     def logp(self, obs):
         """Return the Theano log-likelihood at a point."""
-        obs_tt = at.as_tensor_variable(obs)
+        obs_tt = tt.as_tensor_variable(obs)
 
-        logp_val = at.alloc(-np.inf, *obs.shape)
+        logp_val = tt.alloc(-np.inf, *obs.shape)
 
         for i, dist in enumerate(self.comp_dists):
-            i_mask = at.eq(self.states, i)
+            i_mask = tt.eq(self.states, i)
             obs_i = obs_tt[i_mask]
             subset_dist_kwargs = distribution_subset_args(dist, obs.shape, i_mask)
             subset_dist = dist.dist(**subset_dist_kwargs)
-            logp_val = at.set_subtensor(logp_val[i_mask], subset_dist.logp(obs_i))
+            logp_val = tt.set_subtensor(logp_val[i_mask], subset_dist.logp(obs_i))
 
         return logp_val
 
@@ -255,8 +246,8 @@ def __init__(self, mu=None, states=None, **kwargs):
            A vector of integer 0-1 states that indicate which component of
            the mixture is active at each point/time.
""" - self.mu = at.as_tensor_variable(pm.floatX(mu)) - self.states = at.as_tensor_variable(states) + self.mu = tt.as_tensor_variable(pm.floatX(mu)) + self.states = tt.as_tensor_variable(states) super().__init__( [Constant.dist(np.array(0, dtype=np.int64)), pm.Poisson.dist(mu)], @@ -292,15 +283,15 @@ def __init__(self, Gammas, gamma_0, shape, **kwargs): Shape of the state sequence. The last dimension is `N`, i.e. the length of the state sequence(s). """ - self.gamma_0 = at.as_tensor_variable(pm.floatX(gamma_0)) + self.gamma_0 = tt.as_tensor_variable(pm.floatX(gamma_0)) assert Gammas.ndim >= 3 - self.Gammas = at.as_tensor_variable(pm.floatX(Gammas)) + self.Gammas = tt.as_tensor_variable(pm.floatX(Gammas)) shape = np.atleast_1d(shape) - dtype = _conversion_map[aesara.config.floatX] + dtype = _conversion_map[theano.config.floatX] self.mode = np.zeros(tuple(shape), dtype=dtype) super().__init__(shape=shape, **kwargs) @@ -324,23 +315,23 @@ def logp(self, states): """ # noqa: E501 - states_tt = at.as_tensor(states) + states_tt = tt.as_tensor(states) if states.ndim > 1 or self.Gammas.ndim > 3 or self.gamma_0.ndim > 1: raise NotImplementedError("Broadcasting not supported.") - Gammas_tt = at_broadcast_to( + Gammas_tt = tt_broadcast_to( self.Gammas, (states.shape[0],) + tuple(self.Gammas.shape)[-2:] ) gamma_0_tt = self.gamma_0 Gamma_1_tt = Gammas_tt[0] - P_S_1_tt = at.dot(gamma_0_tt, Gamma_1_tt)[states_tt[0]] + P_S_1_tt = tt.dot(gamma_0_tt, Gamma_1_tt)[states_tt[0]] # def S_logp_fn(S_tm1, S_t, Gamma): - # return at.log(Gamma[..., S_tm1, S_t]) + # return tt.log(Gamma[..., S_tm1, S_t]) # - # P_S_2T_tt, _ = aesara.scan( + # P_S_2T_tt, _ = theano.scan( # S_logp_fn, # sequences=[ # { @@ -350,10 +341,10 @@ def logp(self, states): # Gammas_tt, # ], # ) - P_S_2T_tt = Gammas_tt[at.arange(1, states.shape[0]), states[:-1], states[1:]] + P_S_2T_tt = Gammas_tt[tt.arange(1, states.shape[0]), states[:-1], states[1:]] - log_P_S_1T_tt = at.concatenate( - [at.shape_padright(at.log(P_S_1_tt)), at.log(P_S_2T_tt)] + log_P_S_1T_tt = tt.concatenate( + [tt.shape_padright(tt.log(P_S_1_tt)), tt.log(P_S_2T_tt)] ) res = log_P_S_1T_tt.sum() @@ -424,7 +415,7 @@ class Constant(Distribution): def __init__(self, c, shape=(), defaults=("mode",), **kwargs): - c = at.as_tensor_variable(c) + c = tt.as_tensor_variable(c) dtype = c.dtype @@ -448,7 +439,7 @@ def _random(c, dtype=self.dtype, size=None): ) def logp(self, value): - return at.switch(at.eq(value, self.c), 0.0, -np.inf) + return tt.switch(tt.eq(value, self.c), 0.0, -np.inf) def _distr_parameters_for_repr(self): return ["c"] diff --git a/pymc3_hmm/step_methods.py b/pymc3_hmm/step_methods.py index 9719db9..00f4130 100644 --- a/pymc3_hmm/step_methods.py +++ b/pymc3_hmm/step_methods.py @@ -1,47 +1,28 @@ +from functools import singledispatch from itertools import chain from typing import Callable, Tuple import numpy as np - -try: # pragma: no cover - import aesara.scalar as aes - import aesara.tensor as at - from aesara import config - from aesara.compile import optdb - from aesara.graph.basic import Variable, graph_inputs - from aesara.graph.fg import FunctionGraph - from aesara.graph.op import get_test_value as test_value - from aesara.graph.opt import OpRemove, pre_greedy_local_optimizer - from aesara.graph.optdb import Query - from aesara.scalar.basic import Dot - from aesara.sparse.basic import StructuredDot - from aesara.tensor.elemwise import DimShuffle, Elemwise - from aesara.tensor.subtensor import AdvancedIncSubtensor1 - from aesara.tensor.var import TensorConstant -except 
ImportError: # pragma: no cover - import theano.scalar as aes - import theano.tensor as at - from theano.compile import optdb - from theano.graph.basic import Variable, graph_inputs - from theano.graph.fg import FunctionGraph - from theano.graph.op import get_test_value as test_value - from theano.graph.opt import OpRemove, pre_greedy_local_optimizer - from theano.graph.optdb import Query - from theano.tensor.elemwise import DimShuffle, Elemwise - from theano.tensor.subtensor import AdvancedIncSubtensor1 - from theano.tensor.var import TensorConstant - from theano.tensor.basic import Dot - from theano.sparse.basic import StructuredDot - from theano import config - -from functools import singledispatch - import pymc3 as pm import scipy +import theano.scalar as ts +import theano.tensor as tt from pymc3.distributions.distribution import draw_values from pymc3.step_methods.arraystep import ArrayStep, BlockedStep, Competence from pymc3.util import get_untransformed_name from scipy.stats import invgamma +from theano import config +from theano.compile import optdb +from theano.graph.basic import Variable, graph_inputs +from theano.graph.fg import FunctionGraph +from theano.graph.op import get_test_value as test_value +from theano.graph.opt import OpRemove, pre_greedy_local_optimizer +from theano.graph.optdb import Query +from theano.sparse.basic import StructuredDot +from theano.tensor.basic import Dot +from theano.tensor.elemwise import DimShuffle, Elemwise +from theano.tensor.subtensor import AdvancedIncSubtensor1 +from theano.tensor.var import TensorConstant from pymc3_hmm.distributions import DiscreteMarkovChain, HorseShoe, SwitchingProcess from pymc3_hmm.utils import compute_trans_freqs @@ -185,7 +166,7 @@ def __init__(self, vars, values=None, model=None): for comp_dist in dependent_rv.distribution.comp_dists: comp_logps.append(comp_dist.logp(dependent_rv)) - comp_logp_stacked = at.stack(comp_logps) + comp_logp_stacked = tt.stack(comp_logps) else: raise TypeError( "This sampler only supports `SwitchingProcess` observations" @@ -193,7 +174,7 @@ def __init__(self, vars, values=None, model=None): dep_comps_logp_stacked.append(comp_logp_stacked) - comp_logp_stacked = at.sum(dep_comps_logp_stacked, axis=0) + comp_logp_stacked = tt.sum(dep_comps_logp_stacked, axis=0) (M,) = draw_values([var.distribution.gamma_0.shape[-1]], point=model.test_point) N = model.test_point[var.name].shape[-1] @@ -352,9 +333,9 @@ def _set_row_mappings(self, Gamma, dir_priors, model): Gamma = pre_greedy_local_optimizer( FunctionGraph([], []), [ - OpRemove(Elemwise(aes.Cast(aes.float32))), - OpRemove(Elemwise(aes.Cast(aes.float64))), - OpRemove(Elemwise(aes.identity)), + OpRemove(Elemwise(ts.Cast(ts.float32))), + OpRemove(Elemwise(ts.Cast(ts.float64))), + OpRemove(Elemwise(ts.identity)), ], Gamma, ) @@ -378,7 +359,7 @@ def _set_row_mappings(self, Gamma, dir_priors, model): Gamma_Join = Gamma_DimShuffle.inputs[0].owner - if not (isinstance(Gamma_Join.op, at.basic.Join)): + if not (isinstance(Gamma_Join.op, tt.basic.Join)): raise TypeError( "The transition matrix should be comprised of stacked row vectors" ) @@ -546,7 +527,7 @@ def hs_regression_model_Normal(dist, rv, model): mu = dist.mu y_X_fn = None if hasattr(rv, "observations"): - obs = at.as_tensor_variable(rv.observations) + obs = tt.as_tensor_variable(rv.observations) obs_fn = model.fn(obs) def y_X_fn(points, X): @@ -558,22 +539,22 @@ def y_X_fn(points, X): @hs_regression_model.register(pm.NegativeBinomial) def hs_regression_model_NegativeBinomial(dist, rv, model): - 
mu = at.as_tensor_variable(dist.mu) + mu = tt.as_tensor_variable(dist.mu) - if mu.owner and mu.owner.op == at.exp: + if mu.owner and mu.owner.op == tt.exp: eta = mu.owner.inputs[0] else: eta = mu - alpha = at.as_tensor_variable(dist.alpha) + alpha = tt.as_tensor_variable(dist.alpha) if hasattr(rv, "observations"): from polyagamma import random_polyagamma - obs = at.as_tensor_variable(rv.observations) + obs = tt.as_tensor_variable(rv.observations) h_z_alpha_fn = model.fn( [ alpha + obs, - eta.squeeze() - at.log(alpha), + eta.squeeze() - tt.log(alpha), alpha, obs, ] @@ -617,7 +598,7 @@ def find_dot(node, beta, model, y_fn): return node, X_fn, y_fn else: # if exp transformation - if isinstance(node.owner.op, at.elemwise.Elemwise): + if isinstance(node.owner.op, tt.elemwise.Elemwise): res = find_dot(node.owner.inputs[0], beta, model, y_fn) if res: node, X_fn, _ = res diff --git a/pymc3_hmm/utils.py b/pymc3_hmm/utils.py index 8f2047d..e05a077 100644 --- a/pymc3_hmm/utils.py +++ b/pymc3_hmm/utils.py @@ -3,21 +3,14 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +import theano.tensor as tt from matplotlib import cm from matplotlib.axes import Axes from matplotlib.colors import Colormap from scipy.special import logsumexp - -try: # pragma: no cover - import aesara.tensor as at - from aesara.tensor.extra_ops import broadcast_shape - from aesara.tensor.extra_ops import broadcast_to as at_broadcast_to - from aesara.tensor.var import TensorVariable -except ImportError: # pragma: no cover - import theano.tensor as at - from theano.tensor.extra_ops import broadcast_shape - from theano.tensor.extra_ops import broadcast_to as at_broadcast_to - from theano.tensor.var import TensorVariable +from theano.tensor.extra_ops import broadcast_shape +from theano.tensor.extra_ops import broadcast_to as tt_broadcast_to +from theano.tensor.var import TensorVariable try: # pragma: no cover import datashader as ds @@ -44,8 +37,8 @@ def compute_steady_state(P): P = P[0] N_states = P.shape[-1] - Lam = (at.eye(N_states) - P + at.ones((N_states, N_states))).T - u = at.slinalg.solve(Lam, at.ones((N_states,))) + Lam = (tt.eye(N_states) - P + tt.ones((N_states, N_states))).T + u = tt.slinalg.solve(Lam, tt.ones((N_states,))) return u @@ -95,15 +88,15 @@ def compute_trans_freqs(states, N_states, counts_only=False): def tt_logsumexp(x, axis=None, keepdims=False): """Construct a Theano graph for a log-sum-exp calculation.""" - x_max_ = at.max(x, axis=axis, keepdims=True) + x_max_ = tt.max(x, axis=axis, keepdims=True) if x_max_.ndim > 0: - x_max_ = at.set_subtensor(x_max_[at.isinf(x_max_)], 0.0) - elif at.isinf(x_max_): - x_max_ = at.as_tensor(0.0) + x_max_ = tt.set_subtensor(x_max_[tt.isinf(x_max_)], 0.0) + elif tt.isinf(x_max_): + x_max_ = tt.as_tensor(0.0) - res = at.sum(at.exp(x - x_max_), axis=axis, keepdims=keepdims) - res = at.log(res) + res = tt.sum(tt.exp(x - x_max_), axis=axis, keepdims=keepdims) + res = tt.log(res) if not keepdims: # SciPy uses the `axis` keyword here, but Theano doesn't support that. 
@@ -193,7 +186,7 @@ def tt_broadcast_arrays(*args: TensorVariable):
 
     """
     bcast_shape = broadcast_shape(*args)
-    return tuple(at_broadcast_to(a, bcast_shape) for a in args)
+    return tuple(tt_broadcast_to(a, bcast_shape) for a in args)
 
 
 def multilogit_inv(ys):
@@ -216,7 +209,7 @@ def multilogit_inv(ys):
         lib = np
         lib_logsumexp = logsumexp
     else:
-        lib = at
+        lib = tt
         lib_logsumexp = tt_logsumexp
 
     # exp_ys = lib.exp(ys)
diff --git a/tests/test_distributions.py b/tests/test_distributions.py
index fb4c599..ec38eaf 100644
--- a/tests/test_distributions.py
+++ b/tests/test_distributions.py
@@ -1,14 +1,8 @@
 import numpy as np
-
-try:
-    import aesara
-    import aesara.tensor as at
-except ImportError:
-    import theano as aesara
-    import theano.tensor as at
-
 import pymc3 as pm
 import pytest
+import theano
+import theano.tensor as tt
 
 from pymc3_hmm.distributions import (
     Constant,
@@ -22,8 +16,8 @@
 
 
 def test_DiscreteMarkovChain_str():
-    Gammas = at.as_tensor(np.eye(2)[None, ...], name="Gammas")
-    gamma_0 = at.as_tensor(np.r_[0, 1], name="gamma_0")
+    Gammas = tt.as_tensor(np.eye(2)[None, ...], name="Gammas")
+    gamma_0 = tt.as_tensor(np.r_[0, 1], name="gamma_0")
 
     with pm.Model():
         test_dist = DiscreteMarkovChain("P_rv", Gammas, gamma_0, shape=(2,))
@@ -188,7 +182,7 @@ def test_DiscreteMarkovChain_random():
 
 
 def test_DiscreteMarkovChain_point():
-    test_Gammas = at.as_tensor_variable(np.array([[[1.0, 0.0], [0.0, 1.0]]]))
+    test_Gammas = tt.as_tensor_variable(np.array([[[1.0, 0.0], [0.0, 1.0]]]))
 
     with pm.Model():
         # XXX: `draw_values` won't use the `Deterministic`s values in the `point` map!
@@ -335,7 +329,7 @@ def test_DiscreteMarkovChain_point():
     ],
 )
 def test_DiscreteMarkovChain_logp(Gammas, gamma_0, obs, exp_res):
-    aesara.config.compute_test_value = "warn"
+    theano.config.compute_test_value = "warn"
     test_dist = DiscreteMarkovChain.dist(Gammas, gamma_0, shape=obs.shape[-1])
     test_logp_tt = test_dist.logp(obs)
     test_logp_val = test_logp_tt.eval()
@@ -516,9 +510,9 @@ def test_SwitchingProcess():
     with pytest.raises(TypeError):
         SwitchingProcess.dist([1], test_states)
 
-    with aesara.change_flags(compute_test_value="off"):
+    with theano.change_flags(compute_test_value="off"):
         # Test for the case when a default can't be computed
-        test_dist = pm.Poisson.dist(at.scalar())
+        test_dist = pm.Poisson.dist(tt.scalar())
 
         # Confirm that there's no default
         with pytest.raises(AttributeError):
@@ -529,13 +523,13 @@ def test_SwitchingProcess():
             SwitchingProcess.dist([test_dist], test_states)
 
     # Evaluate multiple observed state sequences in an extreme case
-    test_states = at.imatrix("states")
+    test_states = tt.imatrix("states")
     test_states.tag.test_value = np.zeros((10, 4)).astype("int32")
     test_dist = SwitchingProcess.dist([Constant.dist(0), Constant.dist(1)], test_states)
     test_obs = np.tile(np.arange(4), (10, 1)).astype("int32")
     test_logp = test_dist.logp(test_obs)
     exp_logp = np.tile(
-        np.array([0.0] + [-np.inf] * 3, dtype=aesara.config.floatX), (10, 1)
+        np.array([0.0] + [-np.inf] * 3, dtype=theano.config.floatX), (10, 1)
     )
     assert np.array_equal(test_logp.tag.test_value, exp_logp)
 
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
index c84a872..7269c7f 100644
--- a/tests/test_estimation.py
+++ b/tests/test_estimation.py
@@ -4,26 +4,17 @@
 import numpy as np
 import pandas as pd
 import patsy
-
-try:
-    import aesara
-    import aesara.tensor as at
-    from aesara import shared
-
-    TV_CONFIG = {"aesara_config": {"compute_test_value": "ignore"}}
-except ImportError:
-    import theano as aesara
-    import theano.tensor as at
-    from theano import shared
-
-    TV_CONFIG = {"theano_config": {"compute_test_value": "ignore"}}
-
 import pymc3 as pm
+import theano
+import theano.tensor as tt
+from theano import shared
 
 from pymc3_hmm.distributions import Constant, DiscreteMarkovChain, SwitchingProcess
 from pymc3_hmm.step_methods import FFBSStep
 from pymc3_hmm.utils import multilogit_inv
 
+TV_CONFIG = {"theano_config": {"compute_test_value": "ignore"}}
+
 
 def gen_toy_data(days=-7 * 10):
     dates = date.today() + timedelta(days=days), date.today() + timedelta(days=2)
@@ -40,7 +31,7 @@ def gen_toy_data(days=-7 * 10):
 def create_dirac_zero_hmm(X, mu, xis, observed):
     S = 2
 
-    z_tt = at.stack([at.dot(X, xis[..., s, :]) for s in range(S)], axis=1)
+    z_tt = tt.stack([tt.dot(X, xis[..., s, :]) for s in range(S)], axis=1)
     Gammas_tt = pm.Deterministic("Gamma", multilogit_inv(z_tt))
 
     gamma_0_rv = pm.Dirichlet("gamma_0", np.ones((S,)), shape=S)
@@ -73,8 +64,8 @@ def test_only_positive_state():
         p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
         p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
 
-        P_tt = at.stack([p_0_rv, p_1_rv])
-        Gammas_tt = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_tt = tt.stack([p_0_rv, p_1_rv])
+        Gammas_tt = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))
 
         gamma_0_rv = pm.Dirichlet("gamma_0", np.ones((S,)), shape=S)
 
@@ -187,7 +178,7 @@ def test_time_varying_model():
 
     trace = posterior_trace.posterior.drop_vars(["Gamma", "V_t"])
 
-    with aesara.config.change_flags(compute_test_value="off"):
+    with theano.config.change_flags(compute_test_value="off"):
         adds_pois_ppc = pm.sample_posterior_predictive(
             trace, var_names=["V_t", "Y_t", "Gamma"], model=model
         )
diff --git a/tests/test_step_methods.py b/tests/test_step_methods.py
index 4c5508c..184b68a 100644
--- a/tests/test_step_methods.py
+++ b/tests/test_step_methods.py
@@ -1,22 +1,14 @@
 import warnings
 
 import numpy as np
-
-try:
-    import aesara.tensor as at
-    from aesara import shared
-    from aesara.graph.op import get_test_value
-    from aesara.sparse import structured_dot as sp_dot
-except ImportError:
-    import theano.tensor as at
-    from theano.graph.op import get_test_value
-    from theano.sparse import structured_dot as sp_dot
-    from theano import shared
-
 import pymc3 as pm
 import pytest
 import scipy as sp
+import theano.tensor as tt
 from pymc3.exceptions import SamplingError
+from theano import shared
+from theano.graph.op import get_test_value
+from theano.sparse import structured_dot as sp_dot
 
 from pymc3_hmm.distributions import DiscreteMarkovChain, HorseShoe, PoissonZeroProcess
 from pymc3_hmm.step_methods import (
@@ -143,8 +135,8 @@ def test_FFBSStep():
         p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
         p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
 
-        P_tt = at.stack([p_0_rv, p_1_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_tt = tt.stack([p_0_rv, p_1_rv])
+        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))
 
         pi_0_tt = compute_steady_state(P_rv)
 
@@ -177,8 +169,8 @@ def test_FFBSStep_extreme():
         p_0_rv = poiszero_sim["p_0"]
         p_1_rv = poiszero_sim["p_1"]
 
-        P_tt = at.stack([p_0_rv, p_1_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_tt = tt.stack([p_0_rv, p_1_rv])
+        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))
 
         pi_0_tt = poiszero_sim["pi_0"]
 
@@ -241,8 +233,8 @@ def test_TransMatConjugateStep():
         p_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
         p_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
 
-        P_tt = at.stack([p_0_rv, p_1_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_tt = tt.stack([p_0_rv, p_1_rv])
+        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))
 
         pi_0_tt = compute_steady_state(P_rv)
 
@@ -284,14 +276,14 @@ def test_TransMatConjugateStep_subtensors():
         d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
         d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
 
-        p_0_rv = at.as_tensor([0, 0, 1])
-        p_1_rv = at.zeros(3)
-        p_1_rv = at.set_subtensor(p_0_rv[[0, 2]], d_0_rv)
-        p_2_rv = at.zeros(3)
-        p_2_rv = at.set_subtensor(p_1_rv[[1, 2]], d_1_rv)
+        p_0_rv = tt.as_tensor([0, 0, 1])
+        p_1_rv = tt.zeros(3)
+        p_1_rv = tt.set_subtensor(p_0_rv[[0, 2]], d_0_rv)
+        p_2_rv = tt.zeros(3)
+        p_2_rv = tt.set_subtensor(p_1_rv[[1, 2]], d_1_rv)
 
-        P_tt = at.stack([p_0_rv, p_1_rv, p_2_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_tt = tt.stack([p_0_rv, p_1_rv, p_2_rv])
+        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))
 
         DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(10,))
         transmat = TransMatConjugateStep(P_rv)
@@ -308,16 +300,16 @@ def test_TransMatConjugateStep_subtensors():
         d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
         d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
 
-        p_0_rv = at.as_tensor([0, 0, 1])
-        p_1_rv = at.zeros(3)
-        p_1_rv = at.set_subtensor(p_0_rv[[0, 2]], d_0_rv)
-        p_2_rv = at.zeros(3)
-        p_2_rv = at.set_subtensor(p_1_rv[[1, 2]], d_1_rv)
+        p_0_rv = tt.as_tensor([0, 0, 1])
+        p_1_rv = tt.zeros(3)
+        p_1_rv = tt.set_subtensor(p_0_rv[[0, 2]], d_0_rv)
+        p_2_rv = tt.zeros(3)
+        p_2_rv = tt.set_subtensor(p_1_rv[[1, 2]], d_1_rv)
 
-        P_tt = at.horizontal_stack(
+        P_tt = tt.horizontal_stack(
             p_0_rv[..., None], p_1_rv[..., None], p_2_rv[..., None]
         )
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt.T))
+        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt.T))
 
         DiscreteMarkovChain("S_t", P_rv, np.r_[1, 0, 0], shape=(10,))
         transmat = TransMatConjugateStep(P_rv)
@@ -334,16 +326,16 @@ def test_TransMatConjugateStep_subtensors():
         d_0_rv = pm.Dirichlet("p_0", np.r_[1, 1], shape=2)
         d_1_rv = pm.Dirichlet("p_1", np.r_[1, 1], shape=2)
 
-        p_0_rv = at.as_tensor([0, 0, 1])
-        p_1_rv = at.zeros(3)
-        p_1_rv = at.set_subtensor(p_0_rv[[0, 2]], d_0_rv)
-        p_2_rv = at.zeros(3)
-        p_2_rv = at.set_subtensor(p_1_rv[[1, 2]], d_1_rv)
+        p_0_rv = tt.as_tensor([0, 0, 1])
+        p_1_rv = tt.zeros(3)
+        p_1_rv = tt.set_subtensor(p_0_rv[[0, 2]], d_0_rv)
+        p_2_rv = tt.zeros(3)
+        p_2_rv = tt.set_subtensor(p_1_rv[[1, 2]], d_1_rv)
 
-        P_tt = at.horizontal_stack(
+        P_tt = tt.horizontal_stack(
             p_0_rv[..., None], p_1_rv[..., None], p_2_rv[..., None]
         )
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt.T))
+        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt.T))
 
         DiscreteMarkovChain(
             "S_t", P_rv, np.r_[1, 0, 0], shape=(4,), observed=np.r_[0, 1, 0, 2]
         )
@@ -499,7 +491,7 @@ def test_HSStep_sparse():
     M = X.shape[1]
     with pm.Model():
         beta = HorseShoe("beta", tau=1, shape=M)
-        pm.Normal("y", mu=sp_dot(X, at.shape_padright(beta)), sigma=1, observed=y)
+        pm.Normal("y", mu=sp_dot(X, tt.shape_padright(beta)), sigma=1, observed=y)
         hsstep = HSStep([beta])
         trace = pm.sample(
             draws=50,
@@ -526,7 +518,7 @@ def test_HSStep_NegativeBinomial():
     N_draws = 500
     with pm.Model():
         beta = HorseShoe("beta", tau=1, shape=M)
-        pm.NegativeBinomial("y", mu=at.exp(beta.dot(X.T)), alpha=1, observed=y_nb)
+        pm.NegativeBinomial("y", mu=tt.exp(beta.dot(X.T)), alpha=1, observed=y_nb)
         hsstep = HSStep([beta])
         trace = pm.sample(
             draws=N_draws,
@@ -558,7 +550,7 @@ def test_HSStep_NegativeBinomial():
     with pm.Model():
         beta = HorseShoe("beta", tau=1, shape=M, testval=beta_true * 0.1)
         eta = pm.NegativeBinomial("eta", mu=beta.dot(X.T), alpha=1, shape=N)
-        pm.Normal("y", mu=at.exp(eta), sigma=1, observed=y_nb)
+        pm.Normal("y", mu=tt.exp(eta), sigma=1, observed=y_nb)
 
         with pytest.raises(SamplingError):
             HSStep([beta])
@@ -585,7 +577,7 @@ def test_HSStep_NegativeBinomial_sparse():
     with pm.Model():
         beta = HorseShoe("beta", tau=1, shape=M)
         pm.NegativeBinomial(
-            "y", mu=at.exp(sp_dot(X, at.shape_padright(beta))), alpha=1, observed=y_nb
+            "y", mu=tt.exp(sp_dot(X, tt.shape_padright(beta))), alpha=1, observed=y_nb
         )
         hsstep = HSStep([beta])
         trace = pm.sample(
@@ -619,7 +611,7 @@ def test_HSStep_NegativeBinomial_sparse_shared_y():
         beta = HorseShoe("beta", tau=1, shape=M)
         pm.NegativeBinomial(
             "y",
-            mu=at.exp(sp_dot(X_tt, at.shape_padright(beta))),
+            mu=tt.exp(sp_dot(X_tt, tt.shape_padright(beta))),
             alpha=1,
             observed=y_tt,
         )
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d9b62dd..df7f400 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,13 +1,8 @@
 import numpy as np
 import pytest
 import scipy as sp
-
-try:
-    import aesara
-    import aesara.tensor as at
-except ImportError:
-    import theano as aesara
-    import theano.tensor as at
+import theano
+import theano.tensor as tt
 
 from pymc3_hmm.utils import (
     compute_trans_freqs,
@@ -40,7 +35,7 @@ def test_compute_trans_freqs():
 )
 def test_logsumexp(test_input):
     np_res = sp.special.logsumexp(test_input)
-    tt_res = tt_logsumexp(at.as_tensor_variable(test_input)).eval()
+    tt_res = tt_logsumexp(tt.as_tensor_variable(test_input)).eval()
     assert np.array_equal(np_res, tt_res)
 
 
@@ -69,33 +64,31 @@ def test_logdotexp():
     assert np.allclose(A.dot(b), np.exp(test_res))
 
 
+@theano.config.change_flags(compute_test_value="warn")
+@np.errstate(over="ignore", under="ignore")
 def test_tt_logdotexp():
-    np.seterr(over="ignore", under="ignore")
-
-    aesara.config.compute_test_value = "warn"
-
     A = np.c_[[1.0, 2.0], [3.0, 4.0], [10.0, 20.0]]
     b = np.c_[[0.1], [0.2], [30.0]].T
-    A_tt = at.as_tensor_variable(A)
-    b_tt = at.as_tensor_variable(b)
-    test_res = tt_logdotexp(at.log(A_tt), at.log(b_tt)).eval()
+    A_tt = tt.as_tensor_variable(A)
+    b_tt = tt.as_tensor_variable(b)
+    test_res = tt_logdotexp(tt.log(A_tt), tt.log(b_tt)).eval()
     assert test_res.shape == (2, 1)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     b = np.r_[0.1, 0.2, 30.0]
-    test_res = tt_logdotexp(at.log(A), at.log(b)).eval()
+    test_res = tt_logdotexp(tt.log(A), tt.log(b)).eval()
     assert test_res.shape == (2,)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     A = np.c_[[1.0, 2.0], [10.0, 20.0]]
     b = np.c_[[0.1], [0.2]].T
-    test_res = tt_logdotexp(at.log(A), at.log(b)).eval()
+    test_res = tt_logdotexp(tt.log(A), tt.log(b)).eval()
     assert test_res.shape == (2, 1)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
     b = np.r_[0.1, 0.2]
-    test_res = tt_logdotexp(at.log(A), at.log(b)).eval()
+    test_res = tt_logdotexp(tt.log(A), tt.log(b)).eval()
     assert test_res.shape == (2,)
     assert np.allclose(A.dot(b), np.exp(test_res))
 
 
@@ -122,6 +115,6 @@ def test_multilogit_inv(test_input, test_output):
     assert np.array_equal(res.round(2), test_output)
 
     # Theano testing
-    res = multilogit_inv(at.as_tensor_variable(test_input))
+    res = multilogit_inv(tt.as_tensor_variable(test_input))
     res = res.eval()
     assert np.array_equal(res.round(2), test_output)
diff --git a/tests/utils.py b/tests/utils.py
index c339ba1..29f742b 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,11 +1,6 @@
 import numpy as np
-
-try:
-    import aesara.tensor as at
-except ImportError:
-    import theano.tensor as at
-
 import pymc3 as pm
+import theano.tensor as tt
 
 from pymc3_hmm.distributions import DiscreteMarkovChain, PoissonZeroProcess
 
@@ -18,8 +13,8 @@ def simulate_poiszero_hmm(
        p_0_rv = pm.Dirichlet("p_0", p_0_a, shape=np.shape(pi_0_a))
        p_1_rv = pm.Dirichlet("p_1", p_1_a, shape=np.shape(pi_0_a))
 
-        P_tt = at.stack([p_0_rv, p_1_rv])
-        P_rv = pm.Deterministic("P_tt", at.shape_padleft(P_tt))
+        P_tt = tt.stack([p_0_rv, p_1_rv])
+        P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))
 
        pi_0_tt = pm.Dirichlet("pi_0", pi_0_a, shape=np.shape(pi_0_a))
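-- 
A minimal smoke test of the Theano-only import style this patch settles on
(a sketch placed after the trailer so it stays outside the patch proper; it
mirrors the model setup in tests/utils.py, assumes pymc3 3.11 with
Theano-PyMC, and the mu/shape values are arbitrary):

import numpy as np
import pymc3 as pm
import theano.tensor as tt

from pymc3_hmm.distributions import DiscreteMarkovChain, PoissonZeroProcess

with pm.Model():
    # Two Dirichlet-distributed rows stacked into a transition matrix with a
    # leading time dimension, as the reverted test helpers do throughout.
    p_0_rv = pm.Dirichlet("p_0", np.r_[1.0, 1.0], shape=2)
    p_1_rv = pm.Dirichlet("p_1", np.r_[1.0, 1.0], shape=2)
    P_tt = tt.stack([p_0_rv, p_1_rv])
    P_rv = pm.Deterministic("P_tt", tt.shape_padleft(P_tt))

    pi_0_tt = pm.Dirichlet("pi_0", np.r_[1.0, 1.0], shape=2)

    # Hidden state sequence and zero-inflated Poisson emissions.
    S_rv = DiscreteMarkovChain("S_t", P_rv, pi_0_tt, shape=(100,))
    PoissonZeroProcess("Y_t", mu=10.0, states=S_rv)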