diff --git a/libpympb/pympb.cpp b/libpympb/pympb.cpp
index c6033cb99..5bee3941d 100644
--- a/libpympb/pympb.cpp
+++ b/libpympb/pympb.cpp
@@ -369,12 +369,9 @@ int mode_solver::mean_epsilon(symmetric_matrix *meps, symmetric_matrix *meps_inv
 
   bool o1_is_var = o1 && meep_geom::is_variable(o1->material);
   bool o2_is_var = o2 && meep_geom::is_variable(o2->material);
-  bool default_is_var_or_file =
-      meep_geom::is_variable(default_material) || meep_geom::is_file(default_material);
+  bool default_is_var = meep_geom::is_variable(default_material);
 
-  if (o1_is_var || o2_is_var ||
-      (default_is_var_or_file &&
-       (!o1 || !o2 || meep_geom::is_file(o1->material) || meep_geom::is_file(o2->material)))) {
+  if (o1_is_var || o2_is_var || (default_is_var && (!o1 || !o2))) {
     return 0; /* arbitrary material functions are non-analyzable */
   }
 
diff --git a/python/adjoint/filters.py b/python/adjoint/filters.py
index 262f37937..e77b66855 100644
--- a/python/adjoint/filters.py
+++ b/python/adjoint/filters.py
@@ -6,6 +6,10 @@
 from scipy import signal, special
 
 import meep as mp
+from scipy import special
+from scipy import signal
+import skfmm
+from scipy import interpolate
 
 
 def _proper_pad(x, n):
@@ -874,3 +878,20 @@ def gray_indicator(x):
     density-based topology optimization. Archive of Applied Mechanics, 86(1-2), 189-218.
     """
     return npa.mean(4 * x.flatten() * (1 - x.flatten())) * 100
+
+
+def make_sdf(data):
+    """
+    Assume the input, data, is the desired output shape
+    (i.e. 1d, 2d, or 3d) and that it's values are between
+    0 and 1.
+    """
+    # create signed distance function
+    sd = skfmm.distance(data - 0.5, dx=1)
+
+    # interpolate zero-levelset onto 0.5-levelset
+    x = [np.min(sd.flatten()), 0, np.max(sd.flatten())]
+    y = [0, 0.5, 1]
+    f = interpolate.interp1d(x, y, kind="linear")
+
+    return f(sd)
diff --git a/python/adjoint/optimization_problem.py b/python/adjoint/optimization_problem.py
index 707eb8beb..a7899580e 100644
--- a/python/adjoint/optimization_problem.py
+++ b/python/adjoint/optimization_problem.py
@@ -226,7 +226,7 @@ def adjoint_run(self):
 
         # flip the m number
         if utils._check_if_cylindrical(self.sim):
-            self.sim.m = -self.sim.m
+            self.sim.change_m_number(-self.sim.m)
 
         # flip the k point
         if self.sim.k_point:
@@ -261,11 +261,11 @@ def adjoint_run(self):
 
         # reset the m number
         if utils._check_if_cylindrical(self.sim):
-            self.sim.m = -self.sim.m
+            self.sim.change_m_number(-self.sim.m)
 
         # reset the k point
         if self.sim.k_point:
-            self.sim.k_point *= -1
+            self.sim.change_k_point(-1 * self.sim.k_point)
 
         # update optimizer's state
         self.current_state = "ADJ"
diff --git a/python/adjoint/utils.py b/python/adjoint/utils.py
index fb37d2b0b..9683348e1 100644
--- a/python/adjoint/utils.py
+++ b/python/adjoint/utils.py
@@ -20,7 +20,7 @@
 _Z_AXIS = 1
 
 # default finite difference step size when calculating Aᵤ
-FD_DEFAULT = 1e-3
+FD_DEFAULT = 1e-6
 
 
 class DesignRegion:
diff --git a/python/meep.i b/python/meep.i
index 66a4703cf..0cae141dd 100644
--- a/python/meep.i
+++ b/python/meep.i
@@ -1504,8 +1504,6 @@ void _get_gradient(PyObject *grad, double scalegrad,
 %ignore material_type_equal;
 %ignore is_variable;
 %ignore is_variable;
-%ignore is_file;
-%ignore is_file;
 %ignore is_medium;
 %ignore is_medium;
 %ignore is_metal;
@@ -1986,7 +1984,7 @@ meep_geom::geom_epsilon* _set_materials(meep::structure * s,
         geps = existing_geps;
     } else {
         geps = meep_geom::make_geom_epsilon(s, &gobj_list, center, _ensure_periodicity, _default_material,
-                                                extra_materials);
+                                                extra_materials, use_anisotropic_averaging);
     }
     if (set_materials) {
         meep_geom::set_materials_from_geom_epsilon(s, geps, use_anisotropic_averaging, tol,
diff --git a/python/requirements.txt b/python/requirements.txt
index 772aac158..c43d34869 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -7,3 +7,4 @@ numpy
 parameterized
 pytest
 scipy
+scikit-fmm
diff --git a/python/simulation.py b/python/simulation.py
index a8afd9e3a..6c3e15d63 100644
--- a/python/simulation.py
+++ b/python/simulation.py
@@ -4279,6 +4279,23 @@ def change_k_point(self, k):
                         py_v3_to_vec(self.dimensions, self.k_point, self.is_cylindrical)
                     )
 
+    def change_m_number(self, m):
+        """
+        Change the `m` number (the angular ϕ dependence).
+        """
+        self.m = m
+
+        if self.fields:
+            needs_complex_fields = not (not self.m or self.m == 0)
+
+            if needs_complex_fields and self.fields.is_real:
+                self.fields = None
+                self._is_initialized = False
+                self.init_sim()
+            else:
+                if self.m is not None:
+                    self.fields.change_m_number(m)
+
     def change_sources(self, new_sources):
         """
         Change the list of sources in `Simulation.sources` to `new_sources`, and changes
diff --git a/python/tests/test_material_grid.py b/python/tests/test_material_grid.py
index 284b5c4ed..c40a973cf 100644
--- a/python/tests/test_material_grid.py
+++ b/python/tests/test_material_grid.py
@@ -1,4 +1,13 @@
+"""
+Checks that a material grid works with symmetries, and that the
+new subpixel smoothing feature is accurate.
+
+TODO:
+    Create a 3D test that works well with the new smoothing
+"""
+
 import meep as mp
+from meep import mpb
 
 try:
     import meep.adjoint as mpa
@@ -8,7 +17,12 @@
 import unittest
 
 import numpy as np
-from scipy.ndimage import gaussian_filter
+import unittest
+
+rad = 0.301943
+k_point = mp.Vector3(0.3892, 0.1597, 0)
+Si = mp.Medium(index=3.5)
+cell_size = mp.Vector3(1, 1, 0)
 
 
 def compute_transmittance(matgrid_symmetry=False):
@@ -29,14 +43,17 @@ def compute_transmittance(matgrid_symmetry=False):
     rng = np.random.RandomState(2069588)
 
     w = rng.rand(Nx, Ny)
-    weights = w if matgrid_symmetry else 0.5 * (w + np.fliplr(w))
+    # for subpixel smoothing, the underlying design grid must be smoothly varying
+    w = mpa.tanh_projection(rng.rand(Nx, Ny), 1e5, 0.5)
+    w = mpa.conic_filter(w, 0.25, matgrid_size.x, matgrid_size.y, matgrid_resolution)
+    weights = 0.5 * (w + np.fliplr(w)) if not matgrid_symmetry else w
 
     matgrid = mp.MaterialGrid(
         mp.Vector3(Nx, Ny),
         mp.air,
         mp.Medium(index=3.5),
         weights=weights,
-        do_averaging=False,
+        beta=np.inf,
         grid_type="U_MEAN",
     )
 
@@ -95,16 +112,19 @@ def compute_transmittance(matgrid_symmetry=False):
     ][0]
 
     tran = np.power(np.abs(mode_coeff), 2)
-    print(f'tran:, {"sym" if matgrid_symmetry else "nosym"}, {tran}')
+    print("tran:, {}, {}".format("sym" if matgrid_symmetry else "nosym", tran))
 
     return tran
 
 
-def compute_resonant_mode_2d(res, default_mat=False):
-    cell_size = mp.Vector3(1, 1, 0)
-
-    rad = 0.301943
-
+def compute_resonant_mode_2d(
+    res,
+    radius=rad,
+    default_mat=False,
+    cylinder=False,
+    cylinder_matgrid=True,
+    do_averaging=True,
+):
     fcen = 0.3
     df = 0.2 * fcen
     sources = [
@@ -115,8 +135,6 @@ def compute_resonant_mode_2d(res, default_mat=False):
         )
     ]
 
-    k_point = mp.Vector3(0.3892, 0.1597, 0)
-
     matgrid_size = mp.Vector3(1, 1, 0)
     matgrid_resolution = 1200
 
@@ -129,38 +147,46 @@ def compute_resonant_mode_2d(res, default_mat=False):
     x = np.linspace(-0.5 * matgrid_size.x, 0.5 * matgrid_size.x, Nx)
     y = np.linspace(-0.5 * matgrid_size.y, 0.5 * matgrid_size.y, Ny)
     xv, yv = np.meshgrid(x, y)
-    weights = np.sqrt(np.square(xv) + np.square(yv)) < rad
-    filtered_weights = gaussian_filter(weights, sigma=3.0, output=np.double)
+    weights = mpa.make_sdf(np.sqrt(np.square(xv) + np.square(yv)) < radius)
+    if cylinder:
+        weights = 0.0 * weights + 1.0
 
     matgrid = mp.MaterialGrid(
-        mp.Vector3(Nx, Ny),
-        mp.air,
-        mp.Medium(index=3.5),
-        weights=filtered_weights,
-        do_averaging=True,
-        beta=1000,
-        eta=0.5,
+        mp.Vector3(Nx, Ny), mp.air, Si, weights=weights, beta=np.inf, eta=0.5
     )
 
-    geometry = [
-        mp.Block(
-            center=mp.Vector3(),
-            size=mp.Vector3(matgrid_size.x, matgrid_size.y, 0),
-            material=matgrid,
-        )
-    ]
+    # Use a cylinder object, not a block object
+    if cylinder:
+        # within the cylinder, use a (uniform) material grid
+        if cylinder_matgrid:
+            geometry = [
+                mp.Cylinder(center=mp.Vector3(), radius=radius, material=matgrid)
+            ]
+        # within the cylinder, just use a normal medium
+        else:
+            geometry = [mp.Cylinder(center=mp.Vector3(), radius=radius, material=Si)]
+    # use a block object
+    else:
+        geometry = [
+            mp.Block(
+                center=mp.Vector3(),
+                size=mp.Vector3(matgrid_size.x, matgrid_size.y, 0),
+                material=matgrid,
+            )
+        ]
 
     sim = mp.Simulation(
         resolution=res,
+        eps_averaging=do_averaging,
         cell_size=cell_size,
         default_material=matgrid if default_mat else mp.Medium(),
-        geometry=[] if default_mat else geometry,
+        geometry=geometry if not default_mat else [],
         sources=sources,
         k_point=k_point,
     )
 
     h = mp.Harminv(mp.Hz, mp.Vector3(0.3718, -0.2076), fcen, df)
-    sim.run(mp.after_sources(h), until_after_sources=200)
+    sim.run(mp.after_sources(h), until_after_sources=300)
 
     try:
         for m in h.modes:
@@ -172,94 +198,81 @@ def compute_resonant_mode_2d(res, default_mat=False):
     return freq
 
 
-def compute_resonant_mode_3d(use_matgrid=True):
-    resolution = 25
-
-    wvl = 1.27
-    fcen = 1 / wvl
-    df = 0.02 * fcen
-
-    nSi = 3.45
-    Si = mp.Medium(index=nSi)
-    nSiO2 = 1.45
-    SiO2 = mp.Medium(index=nSiO2)
-
-    s = 1.0
-    cell_size = mp.Vector3(s, s, s)
-
-    rad = 0.34  # radius of sphere
-
-    if use_matgrid:
-        matgrid_resolution = 2 * resolution
-        N = int(s * matgrid_resolution)
-        coord = np.linspace(-0.5 * s, 0.5 * s, N)
-        xv, yv, zv = np.meshgrid(coord, coord, coord)
-
-        weights = np.sqrt(np.square(xv) + np.square(yv) + np.square(zv)) < rad
-        filtered_weights = gaussian_filter(
-            weights, sigma=4 / resolution, output=np.double
-        )
-
-        matgrid = mp.MaterialGrid(
-            mp.Vector3(N, N, N),
-            SiO2,
-            Si,
-            weights=filtered_weights,
-            do_averaging=True,
-            beta=1000,
-            eta=0.5,
-        )
-
-        geometry = [mp.Block(center=mp.Vector3(), size=cell_size, material=matgrid)]
-    else:
-        geometry = [mp.Sphere(center=mp.Vector3(), radius=rad, material=Si)]
-
-    sources = [
-        mp.Source(
-            src=mp.GaussianSource(fcen, fwidth=df),
-            size=mp.Vector3(),
-            center=mp.Vector3(0.13, 0.25, 0.06),
-            component=mp.Ez,
-        )
-    ]
-
-    k_point = mp.Vector3(0.23, -0.17, 0.35)
+def compute_resonant_mode_mpb(resolution=512):
+    geometry = [mp.Cylinder(rad, material=Si)]
+    geometry_lattice = mp.Lattice(cell_size)
 
-    sim = mp.Simulation(
-        resolution=resolution,
-        cell_size=cell_size,
-        sources=sources,
-        default_material=SiO2,
-        k_point=k_point,
+    ms = mpb.ModeSolver(
+        num_bands=1,
+        k_points=[mp.cartesian_to_reciprocal(k_point, geometry_lattice)],
         geometry=geometry,
+        geometry_lattice=geometry_lattice,
+        resolution=resolution,
     )
 
-    h = mp.Harminv(mp.Ez, mp.Vector3(-0.2684, 0.1185, 0.0187), fcen, df)
-
-    sim.run(mp.after_sources(h), until_after_sources=200)
-
-    try:
-        for m in h.modes:
-            print(f"harminv:, {resolution}, {m.freq}, {m.Q}")
-        freq = h.modes[0].freq
-    except:
-        raise RuntimeError("No resonant modes found.")
-
-    return freq
+    ms.run_te()
+    return ms.freqs[0]
 
 
 class TestMaterialGrid(unittest.TestCase):
     def test_subpixel_smoothing(self):
-        # "exact" frequency computed using MaterialGrid at resolution = 300
-        freq_ref = 0.29826813873225283
-
-        res = [25, 50]
+        res = 25
+
+        def subpixel_test_matrix(radius, do_averaging):
+            freq_cylinder = compute_resonant_mode_2d(
+                res,
+                radius,
+                default_mat=False,
+                cylinder=True,
+                cylinder_matgrid=False,
+                do_averaging=do_averaging,
+            )
+            freq_matgrid = compute_resonant_mode_2d(
+                res,
+                radius,
+                default_mat=False,
+                cylinder=False,
+                do_averaging=do_averaging,
+            )
+            freq_matgrid_cylinder = compute_resonant_mode_2d(
+                res,
+                radius,
+                default_mat=False,
+                cylinder=True,
+                cylinder_matgrid=True,
+                do_averaging=do_averaging,
+            )
+            return [freq_cylinder, freq_matgrid, freq_matgrid_cylinder]
+
+        # when smoothing is off, all three tests should be identical to machine precision
+        no_smoothing = subpixel_test_matrix(rad, False)
+        self.assertEqual(no_smoothing[0], no_smoothing[1])
+        self.assertEqual(no_smoothing[1], no_smoothing[2])
+
+        # when we slightly perturb the radius, the results should be the same as before.
+        no_smoothing_perturbed = subpixel_test_matrix(rad + 0.01 / res, False)
+        self.assertEqual(no_smoothing, no_smoothing_perturbed)
+
+        # when smoothing is on, the simple material results should be different from the matgrid results
+        smoothing = subpixel_test_matrix(rad, True)
+        self.assertNotEqual(smoothing[0], smoothing[1])
+        self.assertAlmostEqual(smoothing[1], smoothing[2], 3)
+
+        # when we slighty perturb the radius, the results should all be different from before.
+        smoothing_perturbed = subpixel_test_matrix(rad + 0.01 / res, True)
+        self.assertNotEqual(smoothing, smoothing_perturbed)
+
+        # "exact" frequency computed using MPB
+        freq_ref = compute_resonant_mode_mpb()
+        print(freq_ref)
+        res = [50, 100]
         freq_matgrid = []
         for r in res:
-            freq_matgrid.append(compute_resonant_mode_2d(r))
+            freq_matgrid.append(compute_resonant_mode_2d(r, cylinder=False))
             # verify that the frequency of the resonant mode is
             # approximately equal to the reference value
             self.assertAlmostEqual(freq_ref, freq_matgrid[-1], 2)
+        print("results:    ", freq_ref, freq_matgrid)
 
         # verify that the relative error is decreasing with increasing resolution
         # and is better than linear convergence because of subpixel smoothing
@@ -268,14 +281,10 @@ def test_subpixel_smoothing(self):
             abs(freq_matgrid[0] - freq_ref),
         )
 
-        freq_matgrid_default_mat = compute_resonant_mode_2d(res[0], True)
+        # ensure that a material grid as a default material works
+        freq_matgrid_default_mat = compute_resonant_mode_2d(res[0], rad, True)
         self.assertAlmostEqual(freq_matgrid[0], freq_matgrid_default_mat)
 
-    def test_matgrid_3d(self):
-        freq_matgrid = compute_resonant_mode_3d(True)
-        freq_geomobj = compute_resonant_mode_3d(False)
-        self.assertAlmostEqual(freq_matgrid, freq_geomobj, places=2)
-
     def test_symmetry(self):
         tran_nosym = compute_transmittance(False)
         tran_sym = compute_transmittance(True)
diff --git a/python/typemap_utils.cpp b/python/typemap_utils.cpp
index ab8c8a294..0ad65c6bf 100644
--- a/python/typemap_utils.cpp
+++ b/python/typemap_utils.cpp
@@ -522,8 +522,16 @@ static int pymaterial_grid_to_material_grid(PyObject *po, material_data *md) {
   PyArrayObject *pao = (PyArrayObject *)po_dp;
   if (!PyArray_Check(pao)) { meep::abort("MaterialGrid weights failed to init."); }
   if (!PyArray_ISCARRAY(pao)) { meep::abort("Numpy array weights must be C-style contiguous."); }
-  md->weights = new double[PyArray_SIZE(pao)];
-  memcpy(md->weights, (double *)PyArray_DATA(pao), PyArray_SIZE(pao) * sizeof(double));
+  md->weights.clear();
+  for (size_t i = 0; i < PyArray_SIZE(pao); i++) {
+    md->weights.push_back(((double *)PyArray_DATA(pao))[i]);
+  }
+  md->weights.clear();
+  for (size_t i = 0; i < PyArray_SIZE(pao); i++) {
+    md->weights.push_back(((double *)PyArray_DATA(pao))[i]);
+  }
+  // md->weights = new double[PyArray_SIZE(pao)];
+  // memcpy(md->weights, (double *)PyArray_DATA(pao), PyArray_SIZE(pao) * sizeof(double));
 
   // if needed, combine sus structs to main object
   PyObject *py_e_sus_m1 = PyObject_GetAttrString(po_medium1, "E_susceptibilities");
diff --git a/src/fields.cpp b/src/fields.cpp
index b9f43f338..6d16eb748 100644
--- a/src/fields.cpp
+++ b/src/fields.cpp
@@ -793,4 +793,13 @@ bool operator==(const comms_key &lhs, const comms_key &rhs) {
   return (lhs.ft == rhs.ft) && (lhs.phase == rhs.phase) && (lhs.pair == rhs.pair);
 }
 
+void fields::change_m_number(double new_m) {
+  m = new_m;
+  for (int i = 0; i < num_chunks; i++) {
+    chunks[i]->change_m_number(new_m);
+  }
+}
+
+void fields_chunk::change_m_number(double new_m) { m = new_m; }
+
 } // namespace meep
diff --git a/src/material_data.cpp b/src/material_data.cpp
index a24d87cee..7848aae83 100644
--- a/src/material_data.cpp
+++ b/src/material_data.cpp
@@ -66,10 +66,10 @@ void material_data::copy_from(const material_data &from) {
   }
 
   grid_size = from.grid_size;
-  if (from.weights) {
+  if (from.weights.size() != 0) {
     size_t N = from.grid_size.x * from.grid_size.y * from.grid_size.z;
-    weights = new double[N];
-    std::copy_n(from.weights, N, weights);
+    weights.clear();
+    weights.insert(weights.end(), from.weights.begin(), from.weights.end());
   }
 
   medium_1 = from.medium_1;
diff --git a/src/material_data.hpp b/src/material_data.hpp
index 4e1df20d5..ac6d57e24 100644
--- a/src/material_data.hpp
+++ b/src/material_data.hpp
@@ -25,6 +25,7 @@
 
 #include <meep.hpp>
 #include <ctlgeom.h>
+#include "support/dual.hpp"
 
 #include "meep.hpp"
 
@@ -122,6 +123,9 @@ struct material_data {
   // these fields used only if which_subclass==MATERIAL_USER
   user_material_func user_func;
   void *user_data;
+
+  // used for any variable material (USER, FILE, or GRID):
+  // indicates whether we should use the fallback subpixel averaging via quadrature
   bool do_averaging;
 
   // these fields used only if which_subclass==MATERIAL_FILE
@@ -130,7 +134,7 @@ struct material_data {
 
   // these fields used only if which_subclass==MATERIAL_GRID
   vector3 grid_size;
-  double *weights;
+  std::vector<duals::duald> weights;
   medium_struct medium_1;
   medium_struct medium_2;
   double beta;
@@ -184,7 +188,7 @@ material_type make_user_material(user_material_func user_func, void *user_data);
 material_type make_file_material(char *epsilon_input_file);
 material_type make_material_grid(bool do_averaging, double beta, double eta, double damping);
 void read_epsilon_file(const char *eps_input_file);
-void update_weights(material_type matgrid, double *weights);
+void update_weights(material_type matgrid, duals::duald *weights);
 
 }; // namespace meep_geom
 
diff --git a/src/meep.hpp b/src/meep.hpp
index f5a721e81..9b82072a2 100644
--- a/src/meep.hpp
+++ b/src/meep.hpp
@@ -1546,6 +1546,7 @@ class fields_chunk {
   void remove_sources();
   void remove_susceptibilities(bool shared_chunks);
   void zero_fields();
+  void change_m_number(double new_m);
 
   // update_eh.cpp
   bool needs_W_prev(component c) const;
@@ -1751,6 +1752,7 @@ class fields {
   void remove_fluxes();
   void reset();
   void log(const char *prefix = "");
+  void change_m_number(double new_m);
 
   // time.cpp
   std::vector<double> time_spent_on(time_sink sink);
diff --git a/src/meepgeom.cpp b/src/meepgeom.cpp
index 6b83d109b..c254a9b96 100644
--- a/src/meepgeom.cpp
+++ b/src/meepgeom.cpp
@@ -14,11 +14,15 @@
 %  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
 
+#include <iostream>
 #include <algorithm>
 #include <vector>
 #include "meepgeom.hpp"
 #include "meep_internals.hpp"
 
+namespace dlit = duals::literals;
+using namespace duals::literals;
+
 namespace meep_geom {
 
 #define master_printf meep::master_printf
@@ -121,9 +125,6 @@ void material_free(material_type m) {
   // object so will assume that the caller keeps track of its lifetime.
   delete[] m->epsilon_data;
   m->epsilon_data = NULL;
-
-  delete[] m->weights;
-  m->weights = NULL;
   delete m;
 }
 
@@ -148,9 +149,9 @@ bool material_type_equal(const material_type m1, const material_type m2) {
 /* rotate A by a unitary (real) rotation matrix R:
       RAR = transpose(R) * A * R
 */
-void sym_matrix_rotate(symm_matrix *RAR, const symm_matrix *A_, const double R[3][3]) {
+void sym_matrix_rotate(symm_matrix *RAR, const symm_matrix *A_, const duals::duald R[3][3]) {
   int i, j;
-  double A[3][3], AR[3][3];
+  duals::duald A[3][3], AR[3][3];
   A[0][0] = A_->m00;
   A[1][1] = A_->m11;
   A[2][2] = A_->m22;
@@ -173,8 +174,8 @@ void sym_matrix_rotate(symm_matrix *RAR, const symm_matrix *A_, const double R[3
 
 /* Set Vinv = inverse of V, where both V and Vinv are real-symmetric matrices.*/
 void sym_matrix_invert(symm_matrix *Vinv, const symm_matrix *V) {
-  double m00 = V->m00, m11 = V->m11, m22 = V->m22;
-  double m01 = V->m01, m02 = V->m02, m12 = V->m12;
+  duals::duald m00 = V->m00, m11 = V->m11, m22 = V->m22;
+  duals::duald m01 = V->m01, m02 = V->m02, m12 = V->m12;
 
   if (m01 == 0.0 && m02 == 0.0 && m12 == 0.0) {
     /* optimize common case of a diagonal matrix: */
@@ -184,13 +185,13 @@ void sym_matrix_invert(symm_matrix *Vinv, const symm_matrix *V) {
     Vinv->m01 = Vinv->m02 = Vinv->m12 = 0.0;
   }
   else {
-    double detinv;
+    duals::duald detinv;
 
     /* compute the determinant: */
     detinv = m00 * m11 * m22 - m02 * m11 * m02 + 2.0 * m01 * m12 * m02 - m01 * m01 * m22 -
              m12 * m12 * m00;
 
-    if (detinv == 0.0) meep::abort("singular 3x3 matrix");
+    if (detinv.rpart() == 0.0) meep::abort("singular 3x3 matrix");
 
     detinv = 1.0 / detinv;
 
@@ -206,18 +207,19 @@ void sym_matrix_invert(symm_matrix *Vinv, const symm_matrix *V) {
 
 /* Returns whether or not V is positive-definite. */
 int sym_matrix_positive_definite(symm_matrix *V) {
-  double det2, det3;
-  double m00 = V->m00, m11 = V->m11, m22 = V->m22;
+  duals::duald det2, det3;
+  duals::duald m00 = V->m00, m11 = V->m11, m22 = V->m22;
 
 #if defined(WITH_HERMITIAN_EPSILON)
-  scalar_complex m01 = V->m01, m02 = V->m02, m12 = V->m12;
+  // FIXME
+  scalar_complex m01 = V->m01.rpart(), m02 = V->m02.rpart(), m12 = V->m12.rpart();
 
   det2 = m00 * m11 - CSCALAR_NORMSQR(m01);
   det3 = det2 * m22 - m11 * CSCALAR_NORMSQR(m02) - CSCALAR_NORMSQR(m12) * m00 +
          2.0 * ((m01.re * m12.re - m01.im * m12.im) * m02.re +
                 (m01.re * m12.im + m01.im * m12.re) * m02.im);
 #else  /* real matrix */
-  double m01 = V->m01, m02 = V->m02, m12 = V->m12;
+  duals::duald m01 = V->m01, m02 = V->m02, m12 = V->m12;
 
   det2 = m00 * m11 - m01 * m01;
   det3 = det2 * m22 - m02 * m11 * m02 + 2.0 * m01 * m12 * m02 - m12 * m12 * m00;
@@ -285,15 +287,14 @@ bool is_material_grid(material_type mt) {
 }
 bool is_material_grid(void *md) { return is_material_grid((material_type)md); }
 
+// return whether mt is spatially varying
 bool is_variable(material_type mt) {
-  return (mt->which_subclass == material_data::MATERIAL_USER) ||
-         (mt->which_subclass == material_data::MATERIAL_GRID);
+  return (mt && ((mt->which_subclass == material_data::MATERIAL_USER) ||
+                 (mt->which_subclass == material_data::MATERIAL_GRID) ||
+                 (mt->which_subclass == material_data::MATERIAL_FILE)));
 }
 bool is_variable(void *md) { return is_variable((material_type)md); }
 
-bool is_file(material_type md) { return (md->which_subclass == material_data::MATERIAL_FILE); }
-bool is_file(void *md) { return is_file((material_type)md); }
-
 bool is_medium(material_type md, medium_struct **m) {
   if (md->which_subclass == material_data::MEDIUM) {
     *m = &(md->medium);
@@ -304,24 +305,29 @@ bool is_medium(material_type md, medium_struct **m) {
 
 bool is_medium(void *md, medium_struct **m) { return is_medium((material_type)md, m); }
 
+// note: is_metal assumes that eval_material_pt has already been called for variable materials
 bool is_metal(meep::field_type ft, const material_type *material) {
   material_data *md = *material;
   if (ft == meep::E_stuff) switch (md->which_subclass) {
       case material_data::MEDIUM:
+      case material_data::MATERIAL_USER:
+      case material_data::MATERIAL_FILE:
       case material_data::MATERIAL_GRID:
         return (md->medium.epsilon_diag.x < 0 || md->medium.epsilon_diag.y < 0 ||
                 md->medium.epsilon_diag.z < 0);
       case material_data::PERFECT_METAL: return true;
-      default: meep::abort("unknown material type"); return false;
+      default: meep::abort("unknown material type in is_metal"); return false;
     }
   else
     switch (md->which_subclass) {
       case material_data::MEDIUM:
+      case material_data::MATERIAL_USER:
+      case material_data::MATERIAL_FILE:
       case material_data::MATERIAL_GRID:
         return (md->medium.mu_diag.x < 0 || md->medium.mu_diag.y < 0 || md->medium.mu_diag.z < 0);
       case material_data::PERFECT_METAL:
         return false; // is an electric conductor, but not a magnetic conductor
-      default: meep::abort("unknown material type"); return false;
+      default: meep::abort("unknown material type in is_metal"); return false;
     }
 }
 
@@ -337,37 +343,60 @@ bool has_offdiag(const medium_struct *material) {
 
 // computes the vector-Jacobian product of the gradient of the matgrid_val function v
 // with the Jacobian of the to_geom_box_coords function for geometric_object o
-vector3 to_geom_object_coords_VJP(vector3 v, const geometric_object *o) {
+cvector3 to_geom_object_coords_VJP(cvector3 v, const geometric_object *o) {
   if (!o) { meep::abort("must pass a geometric_object to to_geom_object_coords_VJP.\n"); }
 
   switch (o->which_subclass) {
     default: {
-      vector3 po = {0, 0, 0};
+      cvector3 po = cvector_zero();
       return po;
     }
     case geometric_object::SPHERE: {
       number radius = o->subclass.sphere_data->radius;
-      return vector3_scale(0.5 / radius, v);
+      return cvector3_scale(0.5 / radius, v);
     }
     /* case geometric_object::CYLINDER:
        NOT YET IMPLEMENTED */
     case geometric_object::BLOCK: {
+      /* In order to leverage the underlying libctl infrastructure
+      *and* the dual library that makes computing derivatives so easy,
+      me perform a bit of a trick: we use the *complex* libctl vector,
+      storing the real and dual parts of the dual library and *manually*
+      propagate the dual through existing libraries as needed.
+      */
+      vector3 v_real = cvector3_re(v); // real part
+      vector3 v_imag = cvector3_im(v); // "dual" part
+
       vector3 size = o->subclass.block_data->size;
-      if (size.x != 0.0) v.x /= size.x;
-      if (size.y != 0.0) v.y /= size.y;
-      if (size.z != 0.0) v.z /= size.z;
-      return matrix3x3_transpose_vector3_mult(o->subclass.block_data->projection_matrix, v);
+
+      if (size.x != 0.0) {
+        v_real.x /= size.x;
+        v_imag.x /= size.x;
+      }
+      if (size.y != 0.0) {
+        v_real.y /= size.y;
+        v_imag.y /= size.y;
+      }
+      if (size.z != 0.0) {
+        v_real.z /= size.z;
+        v_imag.z /= size.z;
+      }
+      v_real = matrix3x3_transpose_vector3_mult(o->subclass.block_data->projection_matrix, v_real);
+      v_imag = matrix3x3_transpose_vector3_mult(o->subclass.block_data->projection_matrix, v_imag);
+      return make_cvector3(v_real, v_imag);
     }
       /* case geometric_object::PRISM:
          NOT YET IMPLEMENTED */
   }
 }
 
-meep::vec material_grid_grad(vector3 p, material_data *md, const geometric_object *o) {
+cvector3 material_grid_grad(vector3 p, material_data *md, const geometric_object *o) {
+  /* computes the actual spatial gradient at point `p`
+  for the specified material grid `md`. */
+
   if (!is_material_grid(md)) { meep::abort("Invalid material grid detected.\n"); }
 
-  meep::vec gradient(zero_vec(dim));
-  double *data = md->weights;
+  cvector3 gradient = cvector_zero();
   int nx = md->grid_size.x;
   int ny = md->grid_size.y;
   int nz = md->grid_size.z;
@@ -397,25 +426,26 @@ meep::vec material_grid_grad(vector3 p, material_data *md, const geometric_objec
 
   /* define a macro to give us data(x,y,z) on the grid,
      in row-major order: */
-#define D(x, y, z) (data[(((x)*ny + (y)) * nz + (z)) * stride])
+#define D(x, y, z) ((md->weights)[(((x)*ny + (y)) * nz + (z)) * stride])
 
-  double du_dx =
+  duals::duald du_dx =
       (signflip_dx ? -1.0 : 1.0) *
       (((-D(x1, y1, z1) + D(x2, y1, z1)) * (1.0 - dy) + (-D(x1, y2, z1) + D(x2, y2, z1)) * dy) *
            (1.0 - dz) +
        ((-D(x1, y1, z2) + D(x2, y1, z2)) * (1.0 - dy) + (-D(x1, y2, z2) + D(x2, y2, z2)) * dy) *
            dz);
-  double du_dy = (signflip_dy ? -1.0 : 1.0) * ((-(D(x1, y1, z1) * (1.0 - dx) + D(x2, y1, z1) * dx) +
-                                                (D(x1, y2, z1) * (1.0 - dx) + D(x2, y2, z1) * dx)) *
-                                                   (1.0 - dz) +
-                                               (-(D(x1, y1, z2) * (1.0 - dx) + D(x2, y1, z2) * dx) +
-                                                (D(x1, y2, z2) * (1.0 - dx) + D(x2, y2, z2) * dx)) *
-                                                   dz);
-  double du_dz = (signflip_dz ? -1.0 : 1.0) *
-                 (-((D(x1, y1, z1) * (1.0 - dx) + D(x2, y1, z1) * dx) * (1.0 - dy) +
-                    (D(x1, y2, z1) * (1.0 - dx) + D(x2, y2, z1) * dx) * dy) +
-                  ((D(x1, y1, z2) * (1.0 - dx) + D(x2, y1, z2) * dx) * (1.0 - dy) +
-                   (D(x1, y2, z2) * (1.0 - dx) + D(x2, y2, z2) * dx) * dy));
+  duals::duald du_dy =
+      (signflip_dy ? -1.0 : 1.0) * ((-(D(x1, y1, z1) * (1.0 - dx) + D(x2, y1, z1) * dx) +
+                                     (D(x1, y2, z1) * (1.0 - dx) + D(x2, y2, z1) * dx)) *
+                                        (1.0 - dz) +
+                                    (-(D(x1, y1, z2) * (1.0 - dx) + D(x2, y1, z2) * dx) +
+                                     (D(x1, y2, z2) * (1.0 - dx) + D(x2, y2, z2) * dx)) *
+                                        dz);
+  duals::duald du_dz = (signflip_dz ? -1.0 : 1.0) *
+                       (-((D(x1, y1, z1) * (1.0 - dx) + D(x2, y1, z1) * dx) * (1.0 - dy) +
+                          (D(x1, y2, z1) * (1.0 - dx) + D(x2, y2, z1) * dx) * dy) +
+                        ((D(x1, y1, z2) * (1.0 - dx) + D(x2, y1, z2) * dx) * (1.0 - dy) +
+                         (D(x1, y2, z2) * (1.0 - dx) + D(x2, y2, z2) * dx) * dy));
 
 #undef D
 
@@ -424,25 +454,22 @@ meep::vec material_grid_grad(vector3 p, material_data *md, const geometric_objec
   // respect to r2 where g(r2) is the to_geom_object_coords function (in libctl/utils/geom.c).
   // computing this quantity involves using the chain rule and thus the vector-Jacobian product
   // ∇u J where J is the Jacobian matrix of g.
-  vector3 grad_u;
-  grad_u.x = du_dx * nx;
-  grad_u.y = du_dy * ny;
-  grad_u.z = du_dz * nz;
-  if (o != NULL) {
-    vector3 grad_u_J = to_geom_object_coords_VJP(grad_u, o);
-    gradient.set_direction(meep::X, grad_u_J.x);
-    gradient.set_direction(meep::Y, grad_u_J.y);
-    gradient.set_direction(meep::Z, grad_u_J.z);
-  }
+  vector3 g_real = make_vector3(nx * du_dx.rpart(), ny * du_dy.rpart(), nz * du_dz.rpart());
+  vector3 g_imag = make_vector3(nx * du_dx.dpart(), ny * du_dy.dpart(), nz * du_dz.dpart());
+  cvector3 grad_u = make_cvector3(g_real, g_imag);
+
+  if (o != NULL) { gradient = to_geom_object_coords_VJP(grad_u, o); }
   else {
-    gradient.set_direction(meep::X,
-                           geometry_lattice.size.x == 0 ? 0 : grad_u.x / geometry_lattice.size.x);
-    gradient.set_direction(meep::Y,
-                           geometry_lattice.size.y == 0 ? 0 : grad_u.y / geometry_lattice.size.y);
-    gradient.set_direction(meep::Z,
-                           geometry_lattice.size.z == 0 ? 0 : grad_u.z / geometry_lattice.size.z);
+    g_real = make_vector3(
+        geometry_lattice.size.x == 0 ? 0 : nx * du_dx.rpart() / geometry_lattice.size.x,
+        geometry_lattice.size.y == 0 ? 0 : ny * du_dy.rpart() / geometry_lattice.size.y,
+        geometry_lattice.size.z == 0 ? 0 : nz * du_dz.rpart() / geometry_lattice.size.z);
+    g_imag = make_vector3(
+        geometry_lattice.size.x == 0 ? 0 : nx * du_dx.dpart() / geometry_lattice.size.x,
+        geometry_lattice.size.y == 0 ? 0 : ny * du_dy.dpart() / geometry_lattice.size.y,
+        geometry_lattice.size.z == 0 ? 0 : nz * du_dz.dpart() / geometry_lattice.size.z);
+    gradient = make_cvector3(g_real, g_imag);
   }
-
   return gradient;
 }
 
@@ -452,21 +479,30 @@ void map_lattice_coordinates(double &px, double &py, double &pz) {
   pz = geometry_lattice.size.z == 0 ? 0 : 0.5 + (pz - geometry_center.z) / geometry_lattice.size.z;
 }
 
-meep::vec matgrid_grad(vector3 p, geom_box_tree tp, int oi, material_data *md) {
+cvector3 matgrid_grad(vector3 p, geom_box_tree tp, int oi, material_data *md) {
+  /* loops through all the material grids at a current point
+  and computes the final *spatial* gradient (w.r.t. x,y,z)
+  after all appropriate transformations (e.g. due to
+  overlapping grids). Calls the helper function, `material_grid_grad`,
+  which is what actually computes the spatial gradient.
+  */
+
+  // check for proper overlapping grids
   if (md->material_grid_kinds == material_data::U_MIN ||
       md->material_grid_kinds == material_data::U_PROD)
     meep::abort("%s:%i:matgrid_grad does not support overlapping grids with U_MIN or U_PROD\n",
                 __FILE__, __LINE__);
 
-  meep::vec gradient(zero_vec(dim));
+  cvector3 gradient = cvector_zero();
   int matgrid_val_count = 0;
 
   // iterate through object tree at current point
   if (tp) {
     do {
-      gradient +=
-          material_grid_grad(to_geom_box_coords(p, &tp->objects[oi]),
-                             (material_data *)tp->objects[oi].o->material, tp->objects[oi].o);
+      gradient =
+          cvector_add(gradient, material_grid_grad(to_geom_box_coords(p, &tp->objects[oi]),
+                                                   (material_data *)tp->objects[oi].o->material,
+                                                   tp->objects[oi].o));
       if (md->material_grid_kinds == material_data::U_DEFAULT) break;
       ++matgrid_val_count;
       tp = geom_tree_search_next(p, tp, &oi);
@@ -480,29 +516,51 @@ meep::vec matgrid_grad(vector3 p, geom_box_tree tp, int oi, material_data *md) {
     ++matgrid_val_count;
   }
 
+  // compensate for overlapping grids
   if (md->material_grid_kinds == material_data::U_MEAN)
-    gradient = gradient * 1.0 / matgrid_val_count;
+    gradient = cvector3_scale(1.0 / matgrid_val_count, gradient);
 
   return gradient;
 }
+duals::duald dual_linear_interpolate(double rx, double ry, double rz,
+                                     std::vector<duals::duald> &data, int nx, int ny, int nz,
+                                     int stride) {
+
+  int x1, y1, z1, x2, y2, z2;
+  double dx, dy, dz;
+
+  meep::map_coordinates(rx, ry, rz, nx, ny, nz, x1, y1, z1, x2, y2, z2, dx, dy, dz);
+
+  /* define a macro to give us data(x,y,z) on the grid,
+     in row-major order (the order used by HDF5): */
+#define D(x, y, z) (data[(((x)*ny + (y)) * nz + (z)) * stride])
+
+  return (((D(x1, y1, z1) * (1.0 - dx) + D(x2, y1, z1) * dx) * (1.0 - dy) +
+           (D(x1, y2, z1) * (1.0 - dx) + D(x2, y2, z1) * dx) * dy) *
+              (1.0 - dz) +
+          ((D(x1, y1, z2) * (1.0 - dx) + D(x2, y1, z2) * dx) * (1.0 - dy) +
+           (D(x1, y2, z2) * (1.0 - dx) + D(x2, y2, z2) * dx) * dy) *
+              dz);
 
-double material_grid_val(vector3 p, material_data *md) {
+#undef D
+}
+duals::duald material_grid_val(vector3 p, material_data *md) {
   // given the relative location, p, interpolate the material grid point.
 
-  if (!is_material_grid(md)) { meep::abort("Invalid material grid detected.\n"); }
-  return meep::linear_interpolate(p.x, p.y, p.z, md->weights, md->grid_size.x, md->grid_size.y,
-                                  md->grid_size.z, 1);
+  if (!is_material_grid(md)) { abort(); }
+  return dual_linear_interpolate(p.x, p.y, p.z, md->weights, md->grid_size.x, md->grid_size.y,
+                                 md->grid_size.z, 1);
 }
 
-static double tanh_projection(double u, double beta, double eta) {
+static duals::duald tanh_projection(duals::duald u, double beta, double eta) {
   if (beta == 0) return u;
   if (u == eta) return 0.5; // avoid NaN when beta is Inf
-  double tanh_beta_eta = tanh(beta * eta);
+  duals::duald tanh_beta_eta = tanh(beta * eta);
   return (tanh_beta_eta + tanh(beta * (u - eta))) / (tanh_beta_eta + tanh(beta * (1 - eta)));
 }
 
-double matgrid_val(vector3 p, geom_box_tree tp, int oi, material_data *md) {
-  double uprod = 1.0, umin = 1.0, usum = 0.0, udefault = 0.0, u;
+duals::duald matgrid_val(vector3 p, geom_box_tree tp, int oi, material_data *md) {
+  duals::duald uprod = 1.0, umin = 1.0, usum = 0.0, udefault = 0.0, u;
   int matgrid_val_count = 0;
 
   // iterate through object tree at current point
@@ -569,7 +627,7 @@ static void interp_tensors(vector3 diag_in_1, vector3 offdiag_in_1, vector3 diag
 void epsilon_material_grid(material_data *md, double u) {
   // NOTE: assume p lies on normalized grid within (0,1)
 
-  if (!(md->weights)) meep::abort("material params were not initialized!");
+  if (md->weights.size() == 0) meep::abort("material params were not initialized!");
 
   medium_struct *mm = &(md->medium);
   medium_struct *m1 = &(md->medium_1);
@@ -807,7 +865,7 @@ static void material_epsmu(meep::field_type ft, material_type material, symm_mat
         epsmu_inv->m01 = epsmu_inv->m02 = epsmu_inv->m12 = 0.0;
         break;
 
-      default: meep::abort("unknown material type");
+      default: meep::abort("unknown material type in epsmu");
     }
   else
     switch (md->which_subclass) {
@@ -835,7 +893,7 @@ static void material_epsmu(meep::field_type ft, material_type material, symm_mat
         epsmu_inv->m01 = epsmu_inv->m02 = epsmu_inv->m12 = 0.0;
         break;
 
-      default: meep::abort("unknown material type");
+      default: meep::abort("unknown material type in epsmu");
     }
 }
 
@@ -847,26 +905,31 @@ void geom_epsilon::get_material_pt(material_type &material, const meep::vec &r)
   boolean inobject;
   material =
       (material_type)material_of_unshifted_point_in_tree_inobject(p, restricted_tree, &inobject);
-  material_data *md = material;
+  eval_material_pt(material, p);
+}
 
-  switch (md->which_subclass) {
+// evaluate the material at the given point p if necessary — this is needed if
+// the material is variable (a grid, function, or file); otherwise it is a no-op.
+void geom_epsilon::eval_material_pt(material_type &material, vector3 p) {
+  switch (material->which_subclass) {
     // material grid: interpolate onto user specified material grid to get properties at r
-    case material_data::MATERIAL_GRID:
-      double u;
+    case material_data::MATERIAL_GRID: {
+      duals::duald u;
       int oi;
       geom_box_tree tp;
 
       tp = geom_tree_search(p, restricted_tree, &oi);
       // interpolate and project onto material grid
-      u = tanh_projection(matgrid_val(p, tp, oi, md) + this->u_p, md->beta, md->eta);
+      u = tanh_projection(matgrid_val(p, tp, oi, material), material->beta, material->eta);
       // interpolate material from material grid point
-      epsilon_material_grid(md, u);
+      epsilon_material_grid(material, u.rpart());
 
       return;
+    }
     // material read from file: interpolate to get properties at r
     case material_data::MATERIAL_FILE:
-      if (md->epsilon_data)
-        epsilon_file_material(md, p);
+      if (material->epsilon_data)
+        epsilon_file_material(material, p);
       else
         material = (material_type)default_material;
       return;
@@ -877,16 +940,16 @@ void geom_epsilon::get_material_pt(material_type &material, const meep::vec &r)
     // the user's function only needs to fill in whatever is
     // different from vacuum.
     case material_data::MATERIAL_USER:
-      md->medium = medium_struct();
-      md->user_func(p, md->user_data, &(md->medium));
-      md->medium.check_offdiag_im_zero_or_abort();
+      material->medium = medium_struct();
+      material->user_func(p, material->user_data, &(material->medium));
+      material->medium.check_offdiag_im_zero_or_abort();
       return;
 
     // position-independent material or metal: there is nothing to do
     case material_data::MEDIUM:
     case material_data::PERFECT_METAL: return;
 
-    default: meep::abort("unknown material type");
+    default: meep::abort("unknown material type in eval_material_pt");
   };
 }
 
@@ -907,20 +970,21 @@ double geom_epsilon::chi1p1(meep::field_type ft, const meep::vec &r) {
   material_epsmu(ft, material, &chi1p1, &chi1p1_inv);
   material_gc(material);
 
-  return (chi1p1.m00 + chi1p1.m11 + chi1p1.m22) / 3;
+  return (chi1p1.m00.rpart() + chi1p1.m11.rpart() + chi1p1.m22.rpart()) / 3;
 }
 
 /* Find frontmost object in v, along with the constant material behind it.
-   Returns false if material behind the object is not constant.
+   Returns false if more than two objects & materials intersect the pixel.
 
    Requires moderately horrifying logic to figure things out properly,
    stolen from MPB. */
 static bool get_front_object(const meep::volume &v, geom_box_tree geometry_tree, vector3 &pcenter,
                              const geometric_object **o_front, vector3 &shiftby_front,
-                             material_type &mat_front, material_type &mat_behind) {
+                             material_type &mat_front, material_type &mat_behind, vector3 &p_front,
+                             vector3 &p_behind) {
   vector3 p;
   const geometric_object *o1 = 0, *o2 = 0;
-  vector3 shiftby1 = {0, 0, 0}, shiftby2 = {0, 0, 0};
+  vector3 shiftby1 = {0, 0, 0}, shiftby2 = {0, 0, 0}, p1 = {0, 0, 0}, p2 = {0, 0, 0};
   geom_box pixel;
   material_type mat1 = vacuum, mat2 = vacuum;
   int id1 = -1, id2 = -1;
@@ -982,6 +1046,7 @@ static bool get_front_object(const meep::volume &v, geom_box_tree geometry_tree,
       shiftby1 = shiftby;
       id1 = id;
       mat1 = mat;
+      p1 = q;
     }
     else if (id2 == -1 ||
              ((id >= id1 && id >= id2) && (id1 == id2 || material_type_equal(mat1, mat2)))) {
@@ -989,10 +1054,12 @@ static bool get_front_object(const meep::volume &v, geom_box_tree geometry_tree,
       shiftby2 = shiftby;
       id2 = id;
       mat2 = mat;
+      p2 = q;
     }
     else if (!(id1 < id2 && (id1 == id || material_type_equal(mat1, mat))) &&
-             !(id2 < id1 && (id2 == id || material_type_equal(mat2, mat))))
+             !(id2 < id1 && (id2 == id || material_type_equal(mat2, mat)))) {
       return false;
+    }
   }
 
   // CHECK(id1 > -1, "bug in object_of_point_in_tree?");
@@ -1000,130 +1067,128 @@ static bool get_front_object(const meep::volume &v, geom_box_tree geometry_tree,
     id2 = id1;
     o2 = o1;
     mat2 = mat1;
+    p2 = p1;
     shiftby2 = shiftby1;
   }
 
-  if ((o1 && is_variable(o1->material)) || (o2 && is_variable(o2->material)) ||
-      ((is_variable(default_material) || is_file(default_material)) &&
-       (!o1 || is_file(o1->material) || !o2 || is_file(o2->material))))
-    return false;
-
   if (id1 >= id2) {
     *o_front = o1;
     shiftby_front = shiftby1;
     mat_front = mat1;
-    if (id1 == id2)
+    p_front = p1;
+    if (id1 == id2) {
       mat_behind = mat1;
-    else
+      p_behind = p1;
+    }
+    else {
       mat_behind = mat2;
+      p_behind = p2;
+    }
   }
   if (id2 > id1) {
     *o_front = o2;
     shiftby_front = shiftby2;
     mat_front = mat2;
+    p_front = p2;
     mat_behind = mat1;
+    p_behind = p1;
   }
   return true;
 }
 
 void geom_epsilon::eff_chi1inv_row(meep::component c, double chi1inv_row[3], const meep::volume &v,
                                    double tol, int maxeval) {
+  /* for speed reasons, we need to "wrap" the
+  actual eff_chi1inv_row_grad function below, since
+  the original eff_chi1inv_row is a virtual function
+  and doesn't play well with default arguments */
+  eff_chi1inv_row_grad(c, chi1inv_row, v, tol, maxeval, false);
+}
+void geom_epsilon::eff_chi1inv_row_grad(meep::component c, double chi1inv_row[3],
+                                        const meep::volume &v, double tol, int maxeval,
+                                        bool needs_grad) {
   symm_matrix meps_inv;
   bool fallback;
   eff_chi1inv_matrix(c, &meps_inv, v, tol, maxeval, fallback);
 
   if (fallback) { fallback_chi1inv_row(c, chi1inv_row, v, tol, maxeval); }
   else {
-    switch (component_direction(c)) {
-      case meep::X:
-      case meep::R:
-        chi1inv_row[0] = meps_inv.m00;
-        chi1inv_row[1] = meps_inv.m01;
-        chi1inv_row[2] = meps_inv.m02;
-        break;
-      case meep::Y:
-      case meep::P:
-        chi1inv_row[0] = meps_inv.m01;
-        chi1inv_row[1] = meps_inv.m11;
-        chi1inv_row[2] = meps_inv.m12;
-        break;
-      case meep::Z:
-        chi1inv_row[0] = meps_inv.m02;
-        chi1inv_row[1] = meps_inv.m12;
-        chi1inv_row[2] = meps_inv.m22;
-        break;
-      case meep::NO_DIRECTION: chi1inv_row[0] = chi1inv_row[1] = chi1inv_row[2] = 0; break;
+    /* for gradient calculations, we need to return
+    the dual part of the calcuation, which corresponds
+    to the first derivative w.r.t. u_i */
+    if (needs_grad) {
+      switch (component_direction(c)) {
+        case meep::X:
+        case meep::R:
+          chi1inv_row[0] = meps_inv.m00.dpart();
+          chi1inv_row[1] = meps_inv.m01.dpart();
+          chi1inv_row[2] = meps_inv.m02.dpart();
+          break;
+        case meep::Y:
+        case meep::P:
+          chi1inv_row[0] = meps_inv.m01.dpart();
+          chi1inv_row[1] = meps_inv.m11.dpart();
+          chi1inv_row[2] = meps_inv.m12.dpart();
+          break;
+        case meep::Z:
+          chi1inv_row[0] = meps_inv.m02.dpart();
+          chi1inv_row[1] = meps_inv.m12.dpart();
+          chi1inv_row[2] = meps_inv.m22.dpart();
+          break;
+        case meep::NO_DIRECTION: chi1inv_row[0] = chi1inv_row[1] = chi1inv_row[2] = 0; break;
+      }
     }
-  }
-}
-
-void geom_epsilon::eff_chi1inv_matrix(meep::component c, symm_matrix *chi1inv_matrix,
-                                      const meep::volume &v, double tol, int maxeval,
-                                      bool &fallback) {
-  const geometric_object *o;
-  material_type mat, mat_behind;
-  symm_matrix meps;
-  vector3 p, shiftby, normal;
-  fallback = false;
-
-  if (maxeval == 0) {
-  noavg:
-    get_material_pt(mat, v.center());
-  trivial:
-    material_epsmu(meep::type(c), mat, &meps, chi1inv_matrix);
-    material_gc(mat);
-    return;
-  }
-
-  if (!get_front_object(v, geometry_tree, p, &o, shiftby, mat, mat_behind)) {
-    get_material_pt(mat, v.center());
-    if (mat &&
-        (mat->which_subclass == material_data::MATERIAL_USER ||
-         mat->which_subclass == material_data::MATERIAL_GRID) &&
-        mat->do_averaging) {
-      fallback = true;
-      return;
+    else { // no gradient needed; standard epsilon evaluations
+      switch (component_direction(c)) {
+        case meep::X:
+        case meep::R:
+          chi1inv_row[0] = meps_inv.m00.rpart();
+          chi1inv_row[1] = meps_inv.m01.rpart();
+          chi1inv_row[2] = meps_inv.m02.rpart();
+          break;
+        case meep::Y:
+        case meep::P:
+          chi1inv_row[0] = meps_inv.m01.rpart();
+          chi1inv_row[1] = meps_inv.m11.rpart();
+          chi1inv_row[2] = meps_inv.m12.rpart();
+          break;
+        case meep::Z:
+          chi1inv_row[0] = meps_inv.m02.rpart();
+          chi1inv_row[1] = meps_inv.m12.rpart();
+          chi1inv_row[2] = meps_inv.m22.rpart();
+          break;
+        case meep::NO_DIRECTION: chi1inv_row[0] = chi1inv_row[1] = chi1inv_row[2] = 0; break;
+      }
     }
-    else { goto trivial; }
   }
+}
 
-  /* check for trivial case of only one object/material */
-  if (material_type_equal(mat, mat_behind)) goto trivial;
-
-  // it doesn't make sense to average metals (electric or magnetic)
-  if (is_metal(meep::type(c), &mat) || is_metal(meep::type(c), &mat_behind)) goto noavg;
-
-  normal = unit_vector3(normal_to_fixed_object(vector3_minus(p, shiftby), *o));
-  if (normal.x == 0 && normal.y == 0 && normal.z == 0)
-    goto noavg; // couldn't get normal vector for this point, punt
-  geom_box pixel = gv2box(v);
-  pixel.low = vector3_minus(pixel.low, shiftby);
-  pixel.high = vector3_minus(pixel.high, shiftby);
-
-  double fill = box_overlap_with_object(pixel, *o, tol, maxeval);
+void kottke_algorithm(meep::component c, symm_matrix *chi1inv_matrix, symm_matrix &meps,
+                      symm_matrix &eps2, cvector3 &cnormal, duals::duald fill) {
+  /* Ref: " Perturbation theory for anisotropic dielectric interfaces,
+    and application to subpixel smoothing of discretized numerical methods",
+    https://math.mit.edu/~stevenj/papers/KottkeFa08.pdf
+  */
 
-  material_epsmu(meep::type(c), mat, &meps, chi1inv_matrix);
-  symm_matrix eps2, epsinv2;
   symm_matrix eps1, delta;
-  double Rot[3][3];
-  material_epsmu(meep::type(c), mat_behind, &eps2, &epsinv2);
+  duals::duald Rot[3][3];
   eps1 = meps;
 
-  Rot[0][0] = normal.x;
-  Rot[1][0] = normal.y;
-  Rot[2][0] = normal.z;
-  if (fabs(normal.x) > 1e-2 || fabs(normal.y) > 1e-2) {
-    Rot[0][2] = normal.y;
-    Rot[1][2] = -normal.x;
+  Rot[0][0] = cnormal.x.re + 1_e * cnormal.x.im;
+  Rot[1][0] = cnormal.y.re + 1_e * cnormal.y.im;
+  Rot[2][0] = cnormal.z.re + 1_e * cnormal.z.im;
+  if (fabs(cnormal.x.re) > 1e-2 || fabs(cnormal.y.re) > 1e-2) {
+    Rot[0][2] = cnormal.y.re + 1_e * cnormal.y.im;
+    Rot[1][2] = -(cnormal.x.re + 1_e * cnormal.x.im);
     Rot[2][2] = 0;
   }
   else { /* n is not parallel to z direction, use (x x n) instead */
     Rot[0][2] = 0;
-    Rot[1][2] = -normal.z;
-    Rot[2][2] = normal.y;
+    Rot[1][2] = -(cnormal.z.re + 1_e * cnormal.z.im);
+    Rot[2][2] = cnormal.y.re + 1_e * cnormal.y.im;
   }
   { /* normalize second column */
-    double s = Rot[0][2] * Rot[0][2] + Rot[1][2] * Rot[1][2] + Rot[2][2] * Rot[2][2];
+    duals::duald s = Rot[0][2] * Rot[0][2] + Rot[1][2] * Rot[1][2] + Rot[2][2] * Rot[2][2];
     s = 1.0 / sqrt(s);
     Rot[0][2] *= s;
     Rot[1][2] *= s;
@@ -1172,7 +1237,7 @@ void geom_epsilon::eff_chi1inv_matrix(meep::component c, symm_matrix *chi1inv_ma
 
 #define SWAP(a, b)                                                                                 \
   {                                                                                                \
-    double xxx = a;                                                                                \
+    duals::duald xxx = a;                                                                          \
     a = b;                                                                                         \
     b = xxx;                                                                                       \
   }
@@ -1191,59 +1256,264 @@ void geom_epsilon::eff_chi1inv_matrix(meep::component c, symm_matrix *chi1inv_ma
   sym_matrix_invert(chi1inv_matrix, &meps);
 }
 
-static int eps_ever_negative = 0;
-static meep::field_type func_ft = meep::E_stuff;
+duals::duald get_material_grid_fill(meep::ndim dim, duals::duald d, double r, duals::duald u,
+                                    double eta) {
+  /* The fill fraction should describe the amount of u=1 material in the current pixel.
+  Occasionally, the distance to the nearest interface is outside the current
+  sphere, which means we don't need to do any averaging (the fill is 0 or 1). Again,
+  we don't know if that means we are in void or solid, however, until we look
+  at u. To make things easy, we'll assume the cap is solid until the end, when we
+  can verify.
+  */
+  duals::duald rel_fill;
+  if (abs(d) >= abs(r)) {
+    return -1.0; // garbage fill
+  }
+  else {
+    if (dim == meep::D1)
+      return (r - d) / (2 * r);
+    else if (dim == meep::D2 || dim == meep::Dcyl)
+      return (1 / (r * r * meep::pi)) * (r * r * acos(d / r) - d * sqrt(r * r - d * d));
+    else if (dim == meep::D3)
+      return (((r - d) * (r - d)) / (4 * meep::pi * r * r * r)) * (2 * r + d);
+  }
 
-struct matgrid_volavg {
-  meep::ndim dim;   // dimensionality of voxel
-  double rad;       // (spherical) voxel radius
-  double uval;      // bilinearly-interpolated weight at voxel center
-  double ugrad_abs; // magnitude of gradient of uval
-  double beta;      // thresholding bias
-  double eta;       // thresholding erosion/dilation
-  double eps1;      // trace of epsilon tensor from medium 1
-  double eps2;      // trace of epsilon tensor from medium 2
-};
-
-static void get_uproj_w(const matgrid_volavg *mgva, double x0, double &u_proj, double &w) {
-  // use a linear approximation for the material grid weights around the Yee grid point
-  u_proj = tanh_projection(mgva->uval + mgva->ugrad_abs * x0, mgva->beta, mgva->eta);
-  if (mgva->dim == meep::D1)
-    w = 1 / (2 * mgva->rad);
-  else if (mgva->dim == meep::D2 || mgva->dim == meep::Dcyl)
-    w = 2 * sqrt(mgva->rad * mgva->rad - x0 * x0) / (meep::pi * mgva->rad * mgva->rad);
-  else if (mgva->dim == meep::D3)
-    w = meep::pi * (mgva->rad * mgva->rad - x0 * x0) /
-        (4 / 3 * meep::pi * mgva->rad * mgva->rad * mgva->rad);
+  return rel_fill;
 }
 
-#ifdef CTL_HAS_COMPLEX_INTEGRATION
-static cnumber matgrid_ceps_func(int n, number *x, void *mgva_) {
-  (void)n; // unused
-  double u_proj = 0, w = 0;
-  matgrid_volavg *mgva = (matgrid_volavg *)mgva_;
-  get_uproj_w(mgva, x[0], u_proj, w);
-  cnumber ret;
-  ret.re = (1 - u_proj) * mgva->eps1 + u_proj * mgva->eps2;
-  ret.im = (1 - u_proj) / mgva->eps1 + u_proj / mgva->eps2;
-  return ret * w;
+duals::duald normalize_dual(cvector3 &cv) {
+  duals::duald d[3], norm;
+  // compute the norm, u_0'
+  d[0] = cv.x.re + 1_e * cv.x.im;
+  d[1] = cv.y.re + 1_e * cv.y.im;
+  d[2] = cv.z.re + 1_e * cv.z.im;
+
+  norm = sqrt(abs(d[0]) * abs(d[0]) + abs(d[1]) * abs(d[1]) + abs(d[2]) * abs(d[2]));
+
+  // normalize the normal vector
+  d[0] = d[0] / norm;
+  d[1] = d[1] / norm;
+  d[2] = d[2] / norm;
+
+  cv.x.re = d[0].rpart();
+  cv.x.im = d[0].dpart();
+  cv.y.re = d[1].rpart();
+  cv.y.im = d[1].dpart();
+  cv.z.re = d[2].rpart();
+  cv.z.im = d[2].dpart();
+
+  return norm;
 }
-#else
-static number matgrid_eps_func(int n, number *x, void *mgva_) {
-  (void)n; // unused
-  double u_proj = 0, w = 0;
-  matgrid_volavg *mgva = (matgrid_volavg *)mgva_;
-  get_uproj_w(mgva, x[0], u_proj, w);
-  return w * ((1 - u_proj) * mgva->eps1 + u_proj * mgva->eps2);
-}
-static number matgrid_inveps_func(int n, number *x, void *mgva_) {
-  (void)n; // unused
-  double u_proj = 0, w = 0;
-  matgrid_volavg *mgva = (matgrid_volavg *)mgva_;
-  get_uproj_w(mgva, x[0], u_proj, w);
-  return w * ((1 - u_proj) / mgva->eps1 + u_proj / mgva->eps2);
+
+duals::duald get_distance(cvector3 &cv, duals::duald uval, double eta) {
+  duals::duald d[3], norm;
+  d[0] = cv.x.re + 1_e * cv.x.im;
+  d[1] = cv.y.re + 1_e * cv.y.im;
+  d[2] = cv.z.re + 1_e * cv.z.im;
+
+  norm = sqrt(abs(d[0]) * abs(d[0]) + abs(d[1]) * abs(d[1]) + abs(d[2]) * abs(d[2]));
+  return (eta - uval) / norm;
 }
-#endif
+
+void dual_interpolate_mat(material_type mat, symm_matrix &eps, duals::duald u) {
+  duals::duald u_bar = tanh_projection(u, mat->beta, mat->eta); // projection
+  if (std::isnan(u_bar.dpart()))
+    u_bar = u_bar.rpart(); // when beta=inf, we need to correct when gradient breaks down
+  eps.m00 = mat->medium_1.epsilon_diag.x +
+            u_bar * (mat->medium_2.epsilon_diag.x - mat->medium_1.epsilon_diag.x);
+  eps.m11 = mat->medium_1.epsilon_diag.y +
+            u_bar * (mat->medium_2.epsilon_diag.y - mat->medium_1.epsilon_diag.y);
+  eps.m22 = mat->medium_1.epsilon_diag.z +
+            u_bar * (mat->medium_2.epsilon_diag.z - mat->medium_1.epsilon_diag.z);
+  eps.m01 = mat->medium_1.epsilon_offdiag.x.re +
+            u_bar * (mat->medium_2.epsilon_offdiag.x.re - mat->medium_1.epsilon_offdiag.x.re);
+  eps.m02 = mat->medium_1.epsilon_offdiag.y.re +
+            u_bar * (mat->medium_2.epsilon_offdiag.y.re - mat->medium_1.epsilon_offdiag.y.re);
+  eps.m12 = mat->medium_1.epsilon_offdiag.z.re +
+            u_bar * (mat->medium_2.epsilon_offdiag.z.re - mat->medium_1.epsilon_offdiag.z.re);
+}
+
+template<typename T>
+static bool AreEqual(T f1, T f2) { 
+  return (std::fabs(f1 - f2) <= 2*std::numeric_limits<T>::epsilon() * std::fmax(std::fabs(f1), std::fabs(f2)));
+}
+
+void geom_epsilon::eff_chi1inv_matrix(meep::component c, symm_matrix *chi1inv_matrix,
+                                      const meep::volume &v, double tol, int maxeval,
+                                      bool &fallback) {
+  const geometric_object *o;
+  material_type mat, mat_behind;
+  vector3 p_mat, p_mat_behind, p, shiftby;
+  symm_matrix meps, eps2, epsinv2;
+  cvector3 normal;
+  duals::duald fill, uval;
+  int oi;
+  geom_box_tree tp;
+  fallback = false;
+
+  /* first let's check if our pixel has more than a single
+  interface (i.e. a corner) in which case we abandon any
+  smoothing efforts*/
+  if (!get_front_object(v, geometry_tree, p, &o, shiftby, mat, mat_behind, p_mat, p_mat_behind)) {
+  noavg:
+    get_material_pt(mat, v.center());
+  trivial:
+    material_epsmu(meep::type(c), mat, &meps, chi1inv_matrix);
+    material_gc(mat);
+    return;
+  }
+
+  /* we may have entered this routine (e.g. because of a material grid
+  backprop routine) but don't want to actually do any averaging */
+  if (maxeval == 0) {
+    get_material_pt(mat, v.center());
+    if (is_material_grid(mat)) {
+      tp = geom_tree_search(vec_to_vector3(v.center()), restricted_tree, &oi);
+      uval = matgrid_val(vec_to_vector3(v.center()), tp, oi, mat);
+    mgavg:
+      dual_interpolate_mat(mat, meps, uval);
+      sym_matrix_invert(chi1inv_matrix, &meps);
+      // get_material_pt(mat, v.center());
+      // material_epsmu(meep::type(c), mat, &meps, chi1inv_matrix);
+      material_gc(mat);
+      return;
+    }
+    else { goto trivial; }
+  }
+
+  // For variable materials with do_averaging == true, switch over to slow fallback integration
+  // method. For material grids, however, we have to do some more logic first...
+  if ((is_variable(mat) && mat->do_averaging) ||
+      (is_variable(mat_behind) && mat_behind->do_averaging)) {
+    if ((!is_material_grid(mat)) && (!is_material_grid(mat_behind))) {
+      fallback = true;
+      return;
+    }
+  }
+
+  /* ------------------------------------------- */
+  // One material
+  /* ------------------------------------------- */
+  /* check for case of only one object/material
+   in the case of the material grid, make sure we don't need
+   to do any interface averaging within.
+   */
+  if (material_type_equal(mat, mat_behind)) {
+    /* if we have a material grid, we need to calculate the fill fraction,
+    normal vector, etc. we also need to determine if we really need
+    to do any averaging, or if the current pixel is smooth enough
+    to just evaluate.
+    */
+    if (is_material_grid(mat)) {
+      // if we have a damping term and β≠∞, we can't smooth
+      if ((mat->damping != 0) && (mat->eta != std::numeric_limits<double>::infinity())) goto noavg;
+
+      tp = geom_tree_search(p_mat, restricted_tree, &oi);
+      uval = matgrid_val(p_mat, tp, oi, mat);
+      normal = matgrid_grad(p_mat, tp, oi, mat);
+      double TOL = 4*std::numeric_limits<double>::epsilon();
+      
+      if (AreEqual(cvector3_re(normal).x,0.0) && 
+          AreEqual(cvector3_re(normal).y,0.0) && 
+          AreEqual(cvector3_re(normal).z,0.0)
+        )
+        /* couldn't get normal vector for this point; no averaging is needed,
+        but we still need to interpolate onto our grid using duals */
+        goto mgavg;
+      duals::duald u_prime = normalize_dual(normal);
+      duals::duald d;
+      /* we need to be careful when mat->eta = uval and u_prime is very small...*/
+      d = (mat->eta - uval) / u_prime;
+      double r = v.diameter() / 2;
+      fill = get_material_grid_fill(v.dim, d, r, uval, mat->eta);
+      if (fill < 0) {
+        /* we are far enough away from an interface that
+        we don't need any averaging -- but we still need
+        to interpolate from the material grid */
+        goto mgavg;
+      }
+      else {
+        /* we have a material grid interface within our pixel.
+        we therefore need to set the two materials used for
+        averaging to our corresponding solid and void materials.
+        */
+        symm_matrix eps_0, eps_plus, eps_minus;
+        dual_interpolate_mat(mat, eps_0, uval);
+        dual_interpolate_mat(mat, eps_plus, uval + r * u_prime);
+        dual_interpolate_mat(mat, eps_minus, uval - r * u_prime);
+
+        if (d > 0) {
+          // Case 1: d > 0. Then let ε̃₊ = ε₊ and let ε̃₋ = [ε₀d + ε₋(Δx/2 – d)] / (Δx/2).
+          meps = eps_plus;
+          eps2 = (eps_0 * d + eps_minus * (r - d)) / (r);
+        }
+        else {
+          // Case 2: d < 0. Then let ε̃₋ = ε₋ and let ε̃₊ = [-ε₀d + ε₊(Δx/2 + d)] / (Δx/2).
+          eps2 = eps_minus;
+          meps = (-eps_0 * d + eps_plus * (r + d)) / (r);
+        }
+      }
+    }
+    else if (is_variable(mat)) {
+      // no averaging is needed (not a material grid)
+      eval_material_pt(mat, vec_to_vector3(v.center()));
+      goto trivial;
+      // materials are non variable and uniform -- no need to average
+    }
+    else { goto trivial; }
+    /* ------------------------------------------- */
+    // Two different materials (perhaps a m.g. and a geom object)
+    /* ------------------------------------------- */
+  }
+  else {
+    vector3 normal_re = unit_vector3(normal_to_fixed_object(vector3_minus(p, shiftby), *o));
+    if (normal_re.x == 0 && normal_re.y == 0 && normal_re.z == 0)
+      goto noavg; // couldn't get normal vector for this point, punt
+    normal = make_cvector3(normal_re, make_vector3());
+    geom_box pixel = gv2box(v);
+    pixel.low = vector3_minus(pixel.low, shiftby);
+    pixel.high = vector3_minus(pixel.high, shiftby);
+    fill = duals::duald(box_overlap_with_object(pixel, *o, tol, maxeval), 0);
+    /* Evaluate materials in case they are variable.  This allows us to do fast subpixel averaging
+      at the boundary of an object with a variable material, while remaining accurate enough if the
+      material is continuous over the pixel.  (We make a first-order error by averaging as if the
+      material were constant, but only in a boundary layer of thickness 1/resolution, so the net
+      effect should still be second-order.) */
+    if (is_variable(mat)) {
+      eval_material_pt(mat, p_mat);
+      eval_material_pt(mat_behind, p_mat_behind);
+      if (material_type_equal(mat, mat_behind)) goto trivial;
+    }
+
+    /* to properly determine the tensors of a material
+      that has a material grid, we need to be careful.
+      Specifically, we need to include the effects of u_i
+      in the dual part of the material tensor */
+    if (is_material_grid(mat)) {
+      tp = geom_tree_search(p_mat, restricted_tree, &oi);
+      duals::duald uval = matgrid_val(p_mat, tp, oi, mat);
+      dual_interpolate_mat(mat, meps, uval);
+      sym_matrix_invert(chi1inv_matrix, &meps);
+    }
+    else { material_epsmu(meep::type(c), mat, &meps, chi1inv_matrix); }
+    if (is_material_grid(mat_behind)) {
+      tp = geom_tree_search(p_mat_behind, restricted_tree, &oi);
+      duals::duald uval = matgrid_val(p_mat_behind, tp, oi, mat);
+      dual_interpolate_mat(mat_behind, eps2, uval);
+      sym_matrix_invert(&epsinv2, &eps2); // not really needed
+    }
+    else { material_epsmu(meep::type(c), mat_behind, &eps2, &epsinv2); }
+  }
+
+  // it doesn't make sense to average metals (electric or magnetic)
+  if (is_metal(meep::type(c), &mat) || is_metal(meep::type(c), &mat_behind)) goto noavg;
+
+  kottke_algorithm(c, chi1inv_matrix, meps, eps2, normal, fill);
+  material_gc(mat);
+}
+
+static int eps_ever_negative = 0;
+static meep::field_type func_ft = meep::E_stuff;
 
 #ifdef CTL_HAS_COMPLEX_INTEGRATION
 static cnumber ceps_func(int n, number *x, void *geomeps_) {
@@ -1314,18 +1584,7 @@ void geom_epsilon::fallback_chi1inv_row(meep::component c, double chi1inv_row[3]
   material =
       (material_type)material_of_unshifted_point_in_tree_inobject(p, restricted_tree, &inobject);
   material_data *md = material;
-  meep::vec gradient(zero_vec(v.dim));
-  double uval = 0;
-
-  if (md->which_subclass == material_data::MATERIAL_GRID) {
-    geom_box_tree tp;
-    int oi;
-    tp = geom_tree_search(p, restricted_tree, &oi);
-    gradient = matgrid_grad(p, tp, oi, md);
-    uval = matgrid_val(p, tp, oi, md) + this->u_p;
-  }
-  else { gradient = normal_vector(meep::type(c), v); }
-
+  meep::vec gradient = normal_vector(meep::type(c), v);
   get_material_pt(material, v.center());
   material_epsmu(meep::type(c), material, &chi1p1, &chi1p1_inv);
   material_gc(material);
@@ -1333,19 +1592,19 @@ void geom_epsilon::fallback_chi1inv_row(meep::component c, double chi1inv_row[3]
       chi1p1.m11 != chi1p1.m22 || chi1p1.m00 != chi1p1.m22 || meep::abs(gradient) < 1e-8) {
     int rownum = meep::component_direction(c) % 3;
     if (rownum == 0) {
-      chi1inv_row[0] = chi1p1_inv.m00;
-      chi1inv_row[1] = chi1p1_inv.m01;
-      chi1inv_row[2] = chi1p1_inv.m02;
+      chi1inv_row[0] = chi1p1_inv.m00.rpart();
+      chi1inv_row[1] = chi1p1_inv.m01.rpart();
+      chi1inv_row[2] = chi1p1_inv.m02.rpart();
     }
     else if (rownum == 1) {
-      chi1inv_row[0] = chi1p1_inv.m01;
-      chi1inv_row[1] = chi1p1_inv.m11;
-      chi1inv_row[2] = chi1p1_inv.m12;
+      chi1inv_row[0] = chi1p1_inv.m01.rpart();
+      chi1inv_row[1] = chi1p1_inv.m11.rpart();
+      chi1inv_row[2] = chi1p1_inv.m12.rpart();
     }
     else {
-      chi1inv_row[0] = chi1p1_inv.m02;
-      chi1inv_row[1] = chi1p1_inv.m12;
-      chi1inv_row[2] = chi1p1_inv.m22;
+      chi1inv_row[0] = chi1p1_inv.m02.rpart();
+      chi1inv_row[1] = chi1p1_inv.m12.rpart();
+      chi1inv_row[2] = chi1p1_inv.m22.rpart();
     }
     return;
   }
@@ -1353,79 +1612,49 @@ void geom_epsilon::fallback_chi1inv_row(meep::component c, double chi1inv_row[3]
   integer errflag;
   double meps, minveps;
 
-  if (md->which_subclass == material_data::MATERIAL_GRID) {
-    number xmin[1], xmax[1];
-    matgrid_volavg mgva;
-    mgva.dim = v.dim;
-    mgva.ugrad_abs = meep::abs(gradient);
-    mgva.uval = uval;
-    mgva.rad = v.diameter() / 2;
-    mgva.beta = md->beta;
-    mgva.eta = md->eta;
-    mgva.eps1 =
-        (md->medium_1.epsilon_diag.x + md->medium_1.epsilon_diag.y + md->medium_1.epsilon_diag.z) /
-        3;
-    mgva.eps2 =
-        (md->medium_2.epsilon_diag.x + md->medium_2.epsilon_diag.y + md->medium_2.epsilon_diag.z) /
-        3;
-    xmin[0] = -v.diameter() / 2;
-    xmax[0] = v.diameter() / 2;
-#ifdef CTL_HAS_COMPLEX_INTEGRATION
-    cnumber ret = cadaptive_integration(matgrid_ceps_func, xmin, xmax, 1, (void *)&mgva, 0, tol,
-                                        maxeval, &esterr, &errflag);
-    meps = ret.re;
-    minveps = ret.im;
-#else
-    meps = adaptive_integration(matgrid_eps_func, xmin, xmax, 1, (void *)&mgva, 0, tol, maxeval,
-                                &esterr, &errflag);
-    minveps = adaptive_integration(matgrid_inveps_func, xmin, xmax, 1, (void *)&mgva, 0, tol,
-                                   maxeval, &esterr, &errflag);
-#endif
+  integer n;
+  number xmin[3], xmax[3];
+  vector3 gvmin, gvmax;
+  gvmin = vec_to_vector3(v.get_min_corner());
+  gvmax = vec_to_vector3(v.get_max_corner());
+  xmin[0] = gvmin.x;
+  xmax[0] = gvmax.x;
+  if (dim == meep::Dcyl) {
+    xmin[1] = gvmin.z;
+    xmin[2] = gvmin.y;
+    xmax[1] = gvmax.z;
+    xmax[2] = gvmax.y;
   }
   else {
-    integer n;
-    number xmin[3], xmax[3];
-    vector3 gvmin, gvmax;
-    gvmin = vec_to_vector3(v.get_min_corner());
-    gvmax = vec_to_vector3(v.get_max_corner());
-    xmin[0] = gvmin.x;
-    xmax[0] = gvmax.x;
-    if (dim == meep::Dcyl) {
-      xmin[1] = gvmin.z;
-      xmin[2] = gvmin.y;
-      xmax[1] = gvmax.z;
-      xmax[2] = gvmax.y;
-    }
-    else {
-      xmin[1] = gvmin.y;
-      xmin[2] = gvmin.z;
-      xmax[1] = gvmax.y;
-      xmax[2] = gvmax.z;
-    }
-    if (xmin[2] == xmax[2])
-      n = xmin[1] == xmax[1] ? 1 : 2;
-    else
-      n = 3;
-    double vol = 1;
-    for (int i = 0; i < n; ++i)
-      vol *= xmax[i] - xmin[i];
-    if (dim == meep::Dcyl) vol *= (xmin[0] + xmax[0]) * 0.5;
-    eps_ever_negative = 0;
-    func_ft = meep::type(c);
+    xmin[1] = gvmin.y;
+    xmin[2] = gvmin.z;
+    xmax[1] = gvmax.y;
+    xmax[2] = gvmax.z;
+  }
+  if (xmin[2] == xmax[2])
+    n = xmin[1] == xmax[1] ? 1 : 2;
+  else
+    n = 3;
+  double vol = 1;
+  for (int i = 0; i < n; ++i)
+    vol *= xmax[i] - xmin[i];
+  if (dim == meep::Dcyl) vol *= (xmin[0] + xmax[0]) * 0.5;
+  eps_ever_negative = 0;
+  func_ft = meep::type(c);
 #ifdef CTL_HAS_COMPLEX_INTEGRATION
-    cnumber ret = cadaptive_integration(ceps_func, xmin, xmax, n, (void *)this, 0, tol, maxeval,
-                                        &esterr, &errflag);
-    meps = ret.re / vol;
-    minveps = ret.im / vol;
+  cnumber ret = cadaptive_integration(ceps_func, xmin, xmax, n, (void *)this, 0, tol, maxeval,
+                                      &esterr, &errflag);
+  meps = ret.re / vol;
+  minveps = ret.im / vol;
 #else
-    meps = adaptive_integration(eps_func, xmin, xmax, n, (void *)this, 0, tol, maxeval, &esterr,
-                                &errflag) /
-           vol;
-    minveps = adaptive_integration(inveps_func, xmin, xmax, n, (void *)this, 0, tol, maxeval,
-                                   &esterr, &errflag) /
-              vol;
+  meps = adaptive_integration(eps_func, xmin, xmax, n, (void *)this, 0, tol, maxeval, &esterr,
+                              &errflag) /
+         vol;
+  minveps = adaptive_integration(inveps_func, xmin, xmax, n, (void *)this, 0, tol, maxeval, &esterr,
+                                 &errflag) /
+            vol;
 #endif
-  }
+
   if (eps_ever_negative) // averaging negative eps causes instability
     minveps = 1.0 / (meps = eps(v.center()));
   {
@@ -1667,7 +1896,7 @@ void geom_epsilon::sigma_row(meep::component c, double sigrow[3], const meep::ve
 
     tp = geom_tree_search(p, restricted_tree, &oi);
     // interpolate and project onto material grid
-    u = tanh_projection(matgrid_val(p, tp, oi, mat) + this->u_p, mat->beta, mat->eta);
+    u = tanh_projection(matgrid_val(p, tp, oi, mat), mat->beta, mat->eta).rpart();
     epsilon_material_grid(mat, u); // interpolate material from material grid point
     mat->medium.check_offdiag_im_zero_or_abort();
   }
@@ -1936,7 +2165,8 @@ void add_absorbing_layer(absorber_list alist, double thickness, int direction, i
 if needed */
 geom_epsilon *make_geom_epsilon(meep::structure *s, geometric_object_list *g, vector3 center,
                                 bool _ensure_periodicity, material_type _default_material,
-                                material_type_list extra_materials) {
+                                material_type_list extra_materials,
+                                bool _use_anisotropic_averaging) {
   // set global variables in libctlgeom based on data fields in s
   geom_initialize();
   geometry_center = center;
@@ -1987,6 +2217,7 @@ geom_epsilon *make_geom_epsilon(meep::structure *s, geometric_object_list *g, ve
   }
 
   geom_epsilon *geps = new geom_epsilon(*g, extra_materials, gv.pad().surroundings());
+  geps->use_anisotropic_averaging = _use_anisotropic_averaging;
   return geps;
 }
 
@@ -1996,8 +2227,9 @@ void set_materials_from_geometry(meep::structure *s, geometric_object_list g, ve
                                  bool use_anisotropic_averaging, double tol, int maxeval,
                                  bool _ensure_periodicity, material_type _default_material,
                                  absorber_list alist, material_type_list extra_materials) {
-  meep_geom::geom_epsilon *geps = meep_geom::make_geom_epsilon(s, &g, center, _ensure_periodicity,
-                                                               _default_material, extra_materials);
+  meep_geom::geom_epsilon *geps =
+      meep_geom::make_geom_epsilon(s, &g, center, _ensure_periodicity, _default_material,
+                                   extra_materials, use_anisotropic_averaging);
   set_materials_from_geom_epsilon(s, geps, use_anisotropic_averaging, tol, maxeval, alist);
   delete geps;
 }
@@ -2060,6 +2292,7 @@ material_type make_user_material(user_material_func user_func, void *user_data,
 material_type make_file_material(const char *eps_input_file) {
   material_data *md = new material_data();
   md->which_subclass = material_data::MATERIAL_FILE;
+  md->do_averaging = false;
 
   md->epsilon_dims[0] = md->epsilon_dims[1] = md->epsilon_dims[2] = 1;
   if (eps_input_file && eps_input_file[0]) { // file specified
@@ -2095,7 +2328,9 @@ material_type make_material_grid(bool do_averaging, double beta, double eta, dou
 
 void update_weights(material_type matgrid, double *weights) {
   size_t N = matgrid->grid_size.x * matgrid->grid_size.y * matgrid->grid_size.z;
-  memcpy(matgrid->weights, weights, N * sizeof(double));
+  // memcpy(matgrid->weights, weights, N * sizeof(double));
+  for (size_t i = 0; i < N; i++)
+    matgrid->weights.push_back(weights[i]);
 }
 
 /******************************************************************************/
@@ -2625,7 +2860,7 @@ get_material_gradient(const meep::vec &r,              // current point
                       geom_epsilon *geps,              // material
                       meep::grid_volume &gv,           // simulation grid volume
                       double du,                       // step size
-                      double *u,                       // matgrid
+                      std::vector<duals::duald> &u,    // matgrid
                       int idx                          // matgrid index
 ) {
   /*Compute the Aᵤx product from the -λᵀAᵤx calculation.
@@ -2649,18 +2884,21 @@ get_material_gradient(const meep::vec &r,              // current point
 
   // locate the proper material
   material_type md;
+  if (!gv.contains(r)) meep::abort("Location (%f, %f, %f) is outside of grid volume\n",r.x(),r.y(),r.z());
   geps->get_material_pt(md, r);
 
+  // get the tensor column index corresponding to the forward component
   int dir_idx = 0;
-  if (forward_c == meep::Dx || forward_c == meep::Dr)
-    dir_idx = 0;
-  else if (forward_c == meep::Dy || forward_c == meep::Dp)
-    dir_idx = 1;
-  else if (forward_c == meep::Dz)
-    dir_idx = 2;
-  else
-    meep::abort("Invalid adjoint field component");
+  switch (meep::component_direction(forward_c)) {
+    case meep::X:
+    case meep::R: dir_idx = 0; break;
+    case meep::Y:
+    case meep::P: dir_idx = 1; break;
+    case meep::Z: dir_idx = 2; break;
+    case meep::NO_DIRECTION: meep::abort("Invalid forward component!\n");
+  }
 
+  // materials are non-dispersive
   if (md->trivial) {
     const double sd = 1.0; // FIXME: make user-changable?
     meep::volume v(r);
@@ -2668,30 +2906,29 @@ get_material_gradient(const meep::vec &r,              // current point
       v.set_direction_min(d, r.in_direction(d) - 0.5 * gv.inva * sd);
       v.set_direction_max(d, r.in_direction(d) + 0.5 * gv.inva * sd);
     }
-    double row_1[3], row_2[3], dA_du[3];
-    double orig = u[idx];
-    u[idx] -= du;
-    geps->eff_chi1inv_row(adjoint_c, row_1, v, geps->tol, geps->maxeval);
-    u[idx] += 2 * du;
-    geps->eff_chi1inv_row(adjoint_c, row_2, v, geps->tol, geps->maxeval);
-    u[idx] = orig;
-
-    for (int i = 0; i < 3; i++)
-      dA_du[i] = (row_1[i] - row_2[i]) / (2 * du);
-    return dA_du[dir_idx] * fields_f;
-  }
+    double dA_du[3];
+    double maxevals = geps->use_anisotropic_averaging ? geps->maxeval : 0; // punt if no smoothing
+    /* we can calculate the gradient w.r.t. u_i using
+    a dual method. We just need to "flag" the index
+    within our weights array with a dual value
+    of "1", and the duals library will take care of the
+    rest. */
+    u[idx] += 1_e; // add dual component
+    geps->eff_chi1inv_row_grad(adjoint_c, dA_du, v, geps->tol, maxevals, true);
+    u[idx] -= 1_e; // remove dual component
+    return -dA_du[dir_idx] * fields_f; // note the minus sign! (from Steven's adjoint notes)
+  }
+  // materials have some dispersion
   else {
-    double orig = u[idx];
     std::complex<double> row_1[3], row_2[3], dA_du[3];
     u[idx] -= du;
     eff_chi1inv_row_disp(adjoint_c, row_1, r, freq, geps);
     u[idx] += 2 * du;
     eff_chi1inv_row_disp(adjoint_c, row_2, r, freq, geps);
-    u[idx] = orig;
+    u[idx] -= du;
 
-    for (int i = 0; i < 3; i++)
-      dA_du[i] = (row_1[i] - row_2[i]) / (2 * du);
-    return dA_du[dir_idx] * fields_f * cond_cmp(forward_c, r, freq, geps);
+    return fields_f * (row_1[dir_idx] - row_2[dir_idx]) / (2 * du) *
+           cond_cmp(forward_c, r, freq, geps);
   }
 }
 
@@ -2703,13 +2940,14 @@ much more complicated and it is easier to calculate the entire gradient
 using finite differences (at the cost of slightly less accurate gradients
 due to floating-point roundoff errors). */
 void add_interpolate_weights(double rx, double ry, double rz, double *data, int nx, int ny, int nz,
-                             int stride, double scaleby, double *udata, int ukind, double uval,
-                             meep::vec r, geom_epsilon *geps, meep::component adjoint_c,
-                             meep::component forward_c, std::complex<double> fwd,
-                             std::complex<double> adj, double freq, meep::grid_volume &gv,
-                             double du) {
+                             int stride, double scaleby, std::vector<duals::duald> &udata,
+                             int ukind, duals::duald uval, meep::vec r, geom_epsilon *geps,
+                             meep::component adjoint_c, meep::component forward_c,
+                             std::complex<double> fwd, std::complex<double> adj, double freq,
+                             meep::grid_volume &gv, double du) {
   int x1, y1, z1, x2, y2, z2;
-  double dx, dy, dz, u;
+  double dx, dy, dz;
+  duals::duald u;
 
   meep::map_coordinates(rx, ry, rz, nx, ny, nz, x1, y1, z1, x2, y2, z2, dx, dy, dz);
   int x_list[2] = {x1, x2}, y_list[2] = {y1, y2}, z_list[2] = {z1, z2};
@@ -2720,8 +2958,8 @@ void add_interpolate_weights(double rx, double ry, double rz, double *data, int
 /* define a macro to give us data(x,y,z) on the grid,
 in row-major order (the order used by HDF5): */
 #define IDX(x, y, z) (((x)*ny + (y)) * nz + (z)) * stride
-#define D(x, y, z) (data[(((x)*ny + (y)) * nz + (z)) * stride])
-#define U(x, y, z) (udata[(((x)*ny + (y)) * nz + (z)) * stride])
+#define D(x, y, z) (data[IDX(x, y, z)])
+#define U(x, y, z) (udata[IDX(x, y, z)])
 
   u = (((U(x1, y1, z1) * (1.0 - dx) + U(x2, y1, z1) * dx) * (1.0 - dy) +
         (U(x1, y2, z1) * (1.0 - dx) + U(x2, y2, z1) * dx) * dy) *
@@ -2731,7 +2969,7 @@ in row-major order (the order used by HDF5): */
            dz);
 
   if (ukind == material_data::U_MIN && u != uval) return; // TODO look into this
-  if (ukind == material_data::U_PROD) scaleby *= uval / u;
+  if (ukind == material_data::U_PROD) scaleby *= uval.rpart() / u.rpart();
 
   for (int xi = 0; xi < lx; xi++) {
     for (int yi = 0; yi < ly; yi++) {
@@ -2757,7 +2995,7 @@ void material_grids_addgradient_point(double *v, vector3 p, double scalegrad, ge
   geom_box_tree tp;
   int oi, ois;
   material_data *mg, *mg_sum;
-  double uval;
+  duals::duald uval;
   int kind;
   tp = geom_tree_search(p, geps->geometry_tree, &oi);
 
@@ -2804,13 +3042,12 @@ void material_grids_addgradient_point(double *v, vector3 p, double scalegrad, ge
     it's up to the user to only have one unique design grid in this volume.*/
     vector3 sz = mg->grid_size;
     double *vcur = v;
-    double *ucur = mg->weights;
     uval = tanh_projection(matgrid_val(p, tp, oi, mg), mg->beta, mg->eta);
     do {
       vector3 pb = to_geom_box_coords(p, &tp->objects[oi]);
-      add_interpolate_weights(pb.x, pb.y, pb.z, vcur, sz.x, sz.y, sz.z, 1, scalegrad, ucur, kind,
-                              uval, vector3_to_vec(p), geps, adjoint_c, forward_c, fwd, adj, freq,
-                              gv, tol);
+      add_interpolate_weights(pb.x, pb.y, pb.z, vcur, sz.x, sz.y, sz.z, 1, scalegrad, mg->weights,
+                              kind, uval, vector3_to_vec(p), geps, adjoint_c, forward_c, fwd, adj,
+                              freq, gv, tol);
       if (kind == material_data::U_DEFAULT) break;
       tp = geom_tree_search_next(p, tp, &oi);
     } while (tp && is_material_grid((material_data *)tp->objects[oi].o->material));
@@ -2820,10 +3057,10 @@ void material_grids_addgradient_point(double *v, vector3 p, double scalegrad, ge
     map_lattice_coordinates(p.x, p.y, p.z);
     vector3 sz = mg->grid_size;
     double *vcur = v;
-    double *ucur = mg->weights;
     uval = tanh_projection(material_grid_val(p, mg), mg->beta, mg->eta);
-    add_interpolate_weights(p.x, p.y, p.z, vcur, sz.x, sz.y, sz.z, 1, scalegrad, ucur, kind, uval,
-                            vector3_to_vec(p), geps, adjoint_c, forward_c, fwd, adj, freq, gv, tol);
+    add_interpolate_weights(p.x, p.y, p.z, vcur, sz.x, sz.y, sz.z, 1, scalegrad, mg->weights, kind,
+                            uval, vector3_to_vec(p), geps, adjoint_c, forward_c, fwd, adj, freq, gv,
+                            tol);
   }
 }
 
diff --git a/src/meepgeom.hpp b/src/meepgeom.hpp
index ef77c74ac..d5f0f6ef1 100644
--- a/src/meepgeom.hpp
+++ b/src/meepgeom.hpp
@@ -153,9 +153,71 @@ inline vector3 make_vector3(double x = 0.0, double y = 0.0, double z = 0.0) {
   return v;
 }
 
-typedef struct {
-  double m00, m01, m02, m11, m12, m22;
-} symm_matrix;
+inline cvector3 cvector3_scale(number s, cvector3 v) {
+  return make_cvector3(vector3_scale(s, cvector3_re(v)), vector3_scale(s, cvector3_im(v)));
+}
+
+inline cvector3 cvector_zero() { return make_cvector3(make_vector3(), make_vector3()); }
+
+inline cvector3 cvector_add(cvector3 cv1, cvector3 cv2) {
+  return make_cvector3(vector3_plus(cvector3_re(cv1), cvector3_re(cv2)),
+                       vector3_plus(cvector3_im(cv1), cvector3_im(cv2)));
+}
+
+struct symm_matrix {
+  duals::duald m00, m01, m02, m11, m12, m22;
+  struct symm_matrix &operator+=(const symm_matrix &rhs) {
+    m00 += rhs.m00;
+    m11 += rhs.m11;
+    m22 += rhs.m22;
+    m01 += rhs.m01;
+    m02 += rhs.m02;
+    m12 += rhs.m12;
+    return *this;
+  }
+  struct symm_matrix &operator+=(const duals::duald &k) {
+    m00 += k;
+    m11 += k;
+    m22 += k;
+    return *this;
+  }
+  struct symm_matrix &operator+(const symm_matrix &rhs) {
+    m00 += rhs.m00;
+    m11 += rhs.m11;
+    m22 += rhs.m22;
+    m01 += rhs.m01;
+    m02 += rhs.m02;
+    m12 += rhs.m12;
+    return *this;
+  }
+  struct symm_matrix &operator*(const duals::duald &k) {
+    m00 *= k;
+    m11 *= k;
+    m22 *= k;
+    m01 *= k;
+    m02 *= k;
+    m12 *= k;
+    return *this;
+  }
+  struct symm_matrix &operator/(const duals::duald &k) {
+    m00 /= k;
+    m11 /= k;
+    m22 /= k;
+    m01 /= k;
+    m02 /= k;
+    m12 /= k;
+    return *this;
+  }
+  struct symm_matrix &operator-() {
+    m00 = -m00;
+    m11 = -m11;
+    m22 = -m22;
+    m01 = -m01;
+    m02 = -m02;
+    m12 = -m12;
+    return *this;
+  }
+};
 
 struct pol {
   meep_geom::susceptibility user_s;
@@ -173,6 +235,7 @@ class geom_epsilon : public meep::material_function {
 
 public:
   double u_p = 0;
+  bool use_anisotropic_averaging;
   geom_box_tree geometry_tree;
   geom_box_tree restricted_tree;
   geometric_object_list geometry;
@@ -206,6 +269,8 @@ class geom_epsilon : public meep::material_function {
   virtual double chi1p1(meep::field_type ft, const meep::vec &r);
   virtual void eff_chi1inv_row(meep::component c, double chi1inv_row[3], const meep::volume &v,
                                double tol, int maxeval);
+  void eff_chi1inv_row_grad(meep::component c, double chi1inv_row[3], const meep::volume &v,
+                            double tol, int maxeval, bool needs_grad);
 
   void eff_chi1inv_matrix(meep::component c, symm_matrix *chi1inv_matrix, const meep::volume &v,
                           double tol, int maxeval, bool &fallback);
@@ -221,14 +286,18 @@ class geom_epsilon : public meep::material_function {
 
 private:
   material_type_list extra_materials;
+  void eval_material_pt(material_type &material, vector3 p);
   pol *current_pol;
 };
 
 void set_dimensions(int dims);
+
 geom_epsilon *make_geom_epsilon(meep::structure *s, geometric_object_list *g,
                                 vector3 center = make_vector3(), bool ensure_periodicity = false,
                                 material_type _default_material = vacuum,
-                                material_type_list extra_materials = material_type_list());
+                                material_type_list extra_materials = material_type_list(),
+                                bool use_anisotropic_averaging = false);
+
 //, geometric_object_list g, material_type_list extra_materials
 void set_materials_from_geometry(meep::structure *s, geometric_object_list g,
                                  vector3 center = make_vector3(),
@@ -266,8 +335,6 @@ bool is_material_grid(material_type mt);
 bool is_material_grid(void *md);
 bool is_variable(material_type mt);
 bool is_variable(void *md);
-bool is_file(material_type md);
-bool is_file(void *md);
 bool is_medium(material_type md, medium_struct **m);
 bool is_medium(void *md, medium_struct **m);
 bool is_metal(meep::field_type ft, const material_type *material);
@@ -284,10 +351,8 @@ void init_libctl(material_type default_mat, bool ensure_per, meep::grid_volume *
 // material grid functions
 /***************************************************************/
 void update_weights(material_type matgrid, double *weights);
-meep::vec matgrid_grad(vector3 p, geom_box_tree tp, int oi, material_data *md);
-meep::vec material_grid_grad(vector3 p, material_data *md, const geometric_object *o);
-double matgrid_val(vector3 p, geom_box_tree tp, int oi, material_data *md);
-double material_grid_val(vector3 p, material_data *md);
+duals::duald matgrid_val(vector3 p, geom_box_tree tp, int oi, material_data *md);
+duals::duald material_grid_val(vector3 p, material_data *md);
 geom_box_tree calculate_tree(const meep::volume &v, geometric_object_list g);
 void material_grids_addgradient(double *v, size_t ng, size_t nf,
                                 std::vector<meep::dft_fields *> fields_a,
diff --git a/src/support/Makefile.am b/src/support/Makefile.am
index f65ba7ddb..603b0c461 100644
--- a/src/support/Makefile.am
+++ b/src/support/Makefile.am
@@ -1,3 +1,3 @@
 noinst_LTLIBRARIES = libsupport.la
 
-libsupport_la_SOURCES = mt19937ar.c meep_mt.h
+libsupport_la_SOURCES = mt19937ar.c meep_mt.h dual.hpp
diff --git a/src/support/dual.hpp b/src/support/dual.hpp
new file mode 100644
index 000000000..8c18d8714
--- /dev/null
+++ b/src/support/dual.hpp
@@ -0,0 +1,1459 @@
+//===-- duals/dual - Dual number class --------------------------*- C++ -*-===//
+//
+// Part of the cppduals project.
+// https://tesch1.gitlab.io/cppduals
+//
+// (c)2019 Michael Tesch. tesch1@gmail.com
+//
+// See https://gitlab.com/tesch1/cppduals/blob/master/LICENSE.txt for
+// license information.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef CPPDUALS_DUAL
+#define CPPDUALS_DUAL
+
+#ifndef PARSED_BY_DOXYGEN
+#include <cmath>
+#include <ctgmath>
+#include <limits>
+#include <type_traits>
+#include <complex>
+#include <random>
+#include <iostream>
+#endif
+
+#if !defined(CPPDUALS_IGNORE_COMPILER_VERSION) && !defined(_WIN32)
+#if __cplusplus < 201103L
+#error CPPDUALS needs at least a C++11 compliant compiler
+#endif
+#endif
+
+/// Configure whether system has POSIX extern int signgam;
+#ifndef CPPDUALS_HAVE_SIGNGAM
+#ifndef _WIN32
+#define CPPDUALS_HAVE_SIGNGAM 1
+#endif
+#endif
+namespace duals {
+
+/**
+\file       dual
+\brief      Dual number class.
+
+\mainpage   cppduals
+
+\ref duals/dual is a single-header [Dual
+number](https://en.wikipedia.org/wiki/Dual_number) C++ template
+library that implements numbers of the type \f$(a + b \cdot
+\epsilon)\f$ where \f$ \epsilon \ne 0\f$, and \f$\epsilon^2 = 0\f$.
+
+`duals::dual<>` can be used for "automatic" differentiation, it can
+also recursively nest with itself for higher orders of differentiation
+(ie for the second derivative use `duals::dual<duals::dual<T>>`).  It
+can also be used to differentiate parts of complex functions as
+`std::complex<duals::dual<T>>`.  This file can be used stand-alone for
+dual number support, but for Eigen vectorization support rather the
+file \ref duals/dual_eigen should be included.
+
+```
+#include <duals/dual>
+
+using namespace duals::literals;
+
+template <class T> T   f(T x) { return pow(x,pow(x,x)); }
+template <class T> T  df(T x) { return pow(x,-1. + x + pow(x,x)) * (1. + x*log(x) +
+x*pow(log(x),2.)); } template <class T> T ddf(T x) { return (pow(x,pow(x,x)) * pow(pow(x,x - 1.) +
+pow(x,x)*log(x)*(log(x) + 1.), 2.) + pow(x,pow(x,x)) * (pow(x,x - 1.) * log(x) + pow(x,x - 1.) *
+(log(x) + 1.) + pow(x,x - 1.) * ((x - 1.)/x + log(x)) + pow(x,x) * log(x) * pow(log(x) + 1., 2.) ));
+}
+
+int main()
+{
+  std::cout << "  f(2.)            = " << f(2.)    << "\n";
+  std::cout << " df(2.)            = " << df(2.)   << "\n";
+  std::cout << "ddf(2.)            = " << ddf(2.)  << "\n";
+  std::cout << "  f(2+1_e)         = " << f(2+1_e) << "\n";
+  std::cout << "  f(2+1_e).dpart() = " << f(2+1_e).dpart() << "\n";
+  duals::hyperduald x(2+1_e,1+0_e);
+  std::cout << "  f((2+1_e) + (1+0_e)_e).dpart().dpart() = " << f(x).dpart().dpart() << "\n";
+}
+```
+Produces (notice the derivative in the dual-part):
+```
+  f(2.)            = 16
+ df(2.)            = 107.11
+ddf(2.)            = 958.755
+  f(2+1_e)         = (16+107.11_e)
+  f(2+1_e).dpart() = 107.11
+  f((2+1_e) + (1+0_e)_e).dpart().dpart() = 958.755
+```
+
+How this works can be seen by inspecting the infinite [Taylor
+series](https://en.wikipedia.org/wiki/Taylor_series) expansion of a
+function \f$ f(x) \f$ at \f$ a \f$ with \f$ x = a + b\epsilon \f$.
+The series truncates itself due to the property \f$ \epsilon^2 = 0
+\f$, leaving the function's derivative at \f$ a \f$ in the "dual part"
+of the result (when \f$ b = 1 \f$):
+
+\f[
+\begin{split}
+f(a + b \epsilon) &= f(a) + f'(a)(b \epsilon) + \frac{f''(a)}{2!}(b \epsilon)^2
++ \frac{f'''(a)}{3!}(b \epsilon)^3 + \ldots   \\        \
+&= f(a) + f'(a)(b \epsilon)
+\end{split}
+\f]
+
+The class is contained in a single c++11 header `#include
+<duals/dual>`, for extended [Eigen](http://eigen.tuxfamily.org)
+support, instead include the header \ref duals/dual_eigen "#include <duals/dual_eigen>".
+
+Type X in the templates below can be any value which can be
+assigned to value_type.
+
+Type X also indicates a limitation to dual numbers of the same depth
+but (possibly) different value_type as `duals::dual<T>`.  For example,
+you can assign (or add/sub/mul/div) `duals::dual<float>` and
+`duals::dual<double>`, but you can not assign
+`duals::dual<duals::dual<float>>` to `duals::dual<float>`.
+
+Here is a synopsis of the class:
+
+```
+namespace duals {
+
+template<class T> class dual {
+
+    typedef T value_type;
+
+    dual(const & re = T(), const & du = T());
+    dual(const dual &);
+    template<class X> dual(const dual<X> &);
+
+    T rpart() const;
+    T dpart() const;
+
+    void rpart(T);
+    void dpart(T);
+
+    dual<T> operator-() const;
+    dual<T> operator+() const;
+
+    dual<T> & operator= (const T &);
+    dual<T> & operator+=(const T &);
+    dual<T> & operator-=(const T &);
+    dual<T> & operator*=(const T &);
+    dual<T> & operator/=(const T &);
+
+    dual<T> & operator=(const dual<T> &);
+    template<class X> dual<T> & operator= (const dual<X> &);
+    template<class X> dual<T> & operator+=(const dual<X> &);
+    template<class X> dual<T> & operator-=(const dual<X> &);
+    template<class X> dual<T> & operator*=(const dual<X> &);
+    template<class X> dual<T> & operator/=(const dual<X> &);
+
+    // The comparison operators are not strictly well-defined,
+    // they are implemented as comparison of the real part.
+
+    bool operator ==(const X &b) const;
+    bool operator !=(const X &b) const;
+
+    bool operator <(const X &b) const;
+    bool operator >(const X &b) const;
+    bool operator <=(const X &b) const;
+    bool operator >=(const X &b) const;
+    bool operator <(const dual<X> &b) const;
+    bool operator >(const dual<X> &b) const;
+    bool operator <=(const dual<X> &b) const;
+    bool operator >=(const dual<X> &b) const;
+
+};
+
+// Non-member functions:
+
+T rpart(dual<T>)        // Real part
+T dpart(dual<T>)        // Dual part
+dual<T> dconj(dual<T>)  // Dual-conjugate
+
+dual<T> random(dual<T> low = {0,0}, dual<T> high = {1,0})
+
+dual<T> exp(dual<T>)
+dual<T> log(dual<T>)
+dual<T> log10(dual<T>)
+dual<T> pow(dual<T>, U)
+dual<T> pow(U, dual<T>)
+dual<T> pow(dual<T>, dual<T>)
+dual<T> sqrt(dual<T>)
+dual<T> cbrt(dual<T>)
+dual<T> sin(dual<T>)
+dual<T> cos(dual<T>)
+dual<T> tan(dual<T>)
+dual<T> asin(dual<T>)
+dual<T> acos(dual<T>)
+dual<T> atan(dual<T>)
+
+// TODO:
+dual<T> sinh(dual<T>)
+dual<T> cosh(dual<T>)
+dual<T> tanh(dual<T>)
+dual<T> asinh(dual<T>)
+dual<T> acosh(dual<T>)
+dual<T> atanh(dual<T>)
+
+// Non-differentiable operations on the real part.
+T frexp(duals::dual<T> arg, int* exp );
+duals::dual<T> ldexp(duals::dual<T> arg, int exp );
+T trunc(duals::dual<T> d);
+T floor(duals::dual<T> d);
+T ceil(duals::dual<T> d);
+T round(duals::dual<T> d);
+int fpclassify(duals::dual<T> d);
+bool isfinite(duals::dual<T> d);
+bool isnormal(duals::dual<T> d);
+bool isinf(duals::dual<T> d);
+bool isnan(duals::dual<T> d);
+
+// Stream IO
+template<T> operator>>(basic_istream<charT, traits> &, dual<T> &);
+template<T> operator<<(basic_ostream<charT, traits> &, const dual<T> &);
+
+}
+```
+
+Some useful typedefs:
+
+```
+typedef dual<float> dualf;
+typedef dual<double> duald;
+typedef dual<long double> dualld;
+typedef dual<dualf> hyperdualf;
+typedef dual<duald> hyperduald;
+typedef dual<dualld> hyperdualld;
+typedef std::complex<dualf> cdualf;
+typedef std::complex<duald> cduald;
+typedef std::complex<dualld> cdualld;
+```
+
+There are also literals for dual parts defined in the inline
+namespace duals::literals.  They are available with `using
+namespace duals;` or `using namespace duals::literals`.
+
+```
+using namespace duals::literals;
+dualf  x = 3 + 4_ef;
+duald  y = 3 + 4_e;
+dualld z = 3 + 4_el;
+```
+
+And in case you dislike iostreams, there are some formatters for the
+[`{fmt}`](https://github.com/fmtlib/fmt) formatting library.  These
+are disabled by default, but can be enabled by `#define
+CPPDUALS_LIBFMT` for the `dual<>` formatter, and/or `#define
+CPPDUALS_LIBFMT_COMPLEX` for the std::complex<> formatter. There are
+three custom formatting flags that control how the dual numbers are
+printed, these must come before the normal `{fmt}` formatting spec:
+'$', '*', ','.  For me these are about 3x faster than iostreams.
+
+```
+#define CPPDUALS_LIBFMT
+#define CPPDUALS_LIBFMT_COMPLEX
+#inlcude <duals/dual>
+using namespace duals::literals;
+  ...
+  string s = fmt::format("{:}",  1 + 2_e); // s = "(1.0+2.0_e)"
+  string s = fmt::format("{:g}", 1 + 2_e); // s = "(1+2_e)"
+  string s = fmt::format("{:*}", 1 + 2_e); // s = "(1.0+2.0*e)"
+  string s = fmt::format("{:,}", 1 + 2_e); // s = "(1.0,2.0)"
+  string s = fmt::format("{:*,g}", complexd(1,2_e)); // s = "((1)+(0,2)*i)"
+```
+
+The "archaic Greek epsilon" logo is from [Wikimedia
+commons](https://commons.wikimedia.org/wiki/File:Greek_Epsilon_archaic.svg)
+
+Some casual reading material:
+
+- [ON QUATERNIONS, William Rowan Hamilton, Proceedings of the Royal Irish Academy, 3 (1847), pp.
+1–16.](https://www.maths.tcd.ie/pub/HistMath/People/Hamilton/Quatern2/)
+- [Basic Space-Time Transformations Expressed by Means of Two-Component Number
+Systems](https://doi.org/10.12693%2Faphyspola.86.291)
+
+ */
+
+#ifndef PARSED_BY_DOXYGEN
+template <class T> class dual;
+#endif
+
+/// Check if T is dual<>, match non-duals.
+template <class T> struct is_dual : std::false_type {};
+
+#ifndef PARSED_BY_DOXYGEN
+
+/// Check if T is dual<>, match dual<>.
+template <class T> struct is_dual<dual<T> > : std::true_type {};
+
+#endif
+
+/// Check if T is std::complex<>, match non- std::complex<>.
+template <class T> struct is_complex : std::false_type {};
+
+#ifndef PARSED_BY_DOXYGEN
+
+/// Check if T is std::complex<>, match std::complex<>.
+template <class T> struct is_complex<std::complex<T> > : std::true_type {};
+
+#endif
+
+/// Dual_traits helper class.
+template <class T> struct dual_traits {
+  /// Depth of T - for T=scalar this is 0. for dual_traits<double> it
+  /// is 1.
+  enum { depth = 0 }; // -Wenum-compare
+
+  /// The real storage type.
+  typedef T real_type;
+};
+
+#ifndef PARSED_BY_DOXYGEN
+
+/// dual_traits for dual<> types
+template <class T> struct dual_traits<dual<T> > {
+  /// Depth to which this dual<> type is nested.  One (1) is a
+  /// first-level dual, whereas non-duals have a depth of 0.
+  enum { depth = dual_traits<T>::depth + 1 };
+
+  /// The real storage type.
+  typedef typename dual_traits<T>::real_type real_type;
+};
+
+template <class T> struct dual_traits<std::complex<dual<T> > > {
+  /// complex<dual<T>> have the same 'depth' as their dual.
+  enum { depth = dual_traits<T>::depth };
+
+  /// The real storage type.
+  typedef typename dual_traits<T>::real_type real_type;
+};
+
+namespace detail {
+
+template <class T> struct Void { typedef void type; };
+template <class T, class U = void> struct has_member_type : std::false_type {};
+template <class T>
+struct has_member_type<T, typename Void<typename T::type>::type> : std::true_type {
+  struct wrap {
+    typedef typename T::type type;
+    typedef typename T::type ReturnType;
+  };
+};
+
+} // namespace detail
+
+/// Promote two types - default according to common_type
+template <class T, class U, class V = void> struct promote : std::common_type<T, U> {};
+
+/// Can types A and B be promoted to a common type?
+template <class A, class B> using can_promote = detail::has_member_type<promote<A, B> >;
+
+template <class T, class U>
+struct promote<
+    dual<T>, dual<U>,
+    typename std::enable_if<(can_promote<T, U>::value &&
+                             (int)dual_traits<T>::depth == (int)dual_traits<U>::depth)>::type> {
+  typedef dual<typename promote<U, T>::type> type;
+};
+template <class T, class U>
+struct promote<dual<T>, U,
+               typename std::enable_if<(can_promote<T, U>::value &&
+                                        (int)dual_traits<T>::depth >= (int)dual_traits<U>::depth &&
+                                        !is_complex<U>::value)>::type> {
+  typedef dual<typename promote<U, T>::type> type;
+};
+template <class T, class U>
+struct promote<U, dual<T>,
+               typename std::enable_if<(can_promote<T, U>::value &&
+                                        (int)dual_traits<T>::depth >= (int)dual_traits<U>::depth &&
+                                        !is_complex<U>::value)>::type> {
+  typedef dual<typename promote<U, T>::type> type;
+};
+// /////////////////////////////////////////////////
+template <class T, class U>
+struct promote<std::complex<T>, std::complex<U>,
+               typename std::enable_if<(can_promote<T, U>::value &&
+                                        (is_dual<T>::value || is_dual<U>::value))>::type> {
+  typedef std::complex<typename promote<T, U>::type> type;
+};
+template <class T, class U>
+struct promote<
+    std::complex<T>, U,
+    typename std::enable_if<(can_promote<T, U>::value && (is_dual<T>::value || is_dual<U>::value) &&
+                             !is_complex<U>::value)>::type> {
+  typedef std::complex<typename promote<T, U>::type> type;
+};
+template <class T, class U>
+struct promote<
+    U, std::complex<T>,
+    typename std::enable_if<(can_promote<T, U>::value && (is_dual<T>::value || is_dual<U>::value) &&
+                             !is_complex<U>::value)>::type> {
+  typedef std::complex<typename promote<T, U>::type> type;
+};
+// /////////////////////////////////////////////////
+
+#endif // PARSED_BY_DOXYGEN
+
+} // namespace duals
+
+#ifndef PARSED_BY_DOXYGEN
+
+#define NOMACRO // thwart macroification
+#ifdef EIGEN_PI
+#define MY_PI EIGEN_PI
+#else
+#define MY_PI M_PI
+#endif
+
+#endif // PARSED_BY_DOXYGEN
+
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#endif
+
+#ifdef CPPDUALS_ENABLE_STD_IS_ARITHMETIC
+
+/// Duals are as arithmetic as their value_type is arithmetic.
+template <class T> struct is_arithmetic<duals::dual<T> > : is_arithmetic<T> {};
+
+#endif // CPPDUALS_ENABLE_IS_ARITHMETIC
+
+/// Duals are compound types.
+template <class T> struct is_compound<duals::dual<T> > : true_type {};
+
+// Modification of std::numeric_limits<> per
+// C++03 17.4.3.1/1, and C++11 18.3.2.3/1.
+template <class T> struct numeric_limits<duals::dual<T> > : numeric_limits<T> {
+  static constexpr bool is_specialized = true;
+  static constexpr duals::dual<T> min NOMACRO() { return numeric_limits<T>::min NOMACRO(); }
+  static constexpr duals::dual<T> lowest() { return numeric_limits<T>::lowest(); }
+  static constexpr duals::dual<T> max NOMACRO() { return numeric_limits<T>::max NOMACRO(); }
+  static constexpr duals::dual<T> epsilon() { return numeric_limits<T>::epsilon(); }
+  static constexpr duals::dual<T> round_error() { return numeric_limits<T>::round_error(); }
+  static constexpr duals::dual<T> infinity() { return numeric_limits<T>::infinity(); }
+  static constexpr duals::dual<T> quiet_NaN() { return numeric_limits<T>::quiet_NaN(); }
+  static constexpr duals::dual<T> signaling_NaN() { return numeric_limits<T>::signaling_NaN(); }
+  static constexpr duals::dual<T> denorm_min() { return numeric_limits<T>::denorm_min(); }
+};
+
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+} // namespace std
+#endif
+
+namespace duals {
+
+#ifndef PARSED_BY_DOXYGEN
+
+// T and X are wrapped in a dual<>
+#define CPPDUALS_ONLY_SAME_DEPTH_AS_T(T, X)                                                        \
+  typename std::enable_if<(int)duals::dual_traits<X>::depth == (int)duals::dual_traits<T>::depth,  \
+                          int>::type = 0,                                                          \
+                          typename std::enable_if<can_promote<T, X>::value, int>::type = 0
+
+// Both T and U are wrapped in a dual<>
+#define CPPDUALS_ENABLE_SAME_DEPTH_AND_COMMON_T(T, U)                                              \
+  typename std::enable_if<(int)duals::dual_traits<T>::depth == (int)duals::dual_traits<U>::depth,  \
+                          int>::type = 0,                                                          \
+                          typename std::enable_if<can_promote<T, U>::value, int>::type = 0,        \
+                          typename common_t = dual<typename duals::promote<T, U>::type>
+
+// T is wrapped in a dual<>, U may or may not be.
+#define CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U)                                               \
+  typename std::enable_if<((int)duals::dual_traits<T>::depth >=                                    \
+                           (int)duals::dual_traits<U>::depth),                                     \
+                          int>::type = 0,                                                          \
+                          typename std::enable_if<can_promote<dual<T>, U>::value, int>::type = 0,  \
+                          typename common_t = typename duals::promote<dual<T>, U>::type
+
+// T is wrapped in complex<dual<>>
+#define CPPDUALS_ENABLE_LEQ_DEPTH_AND_CX_COMMON_T(T, U)                                            \
+  typename std::enable_if<                                                                         \
+      ((int)duals::dual_traits<T>::depth >= (int)duals::dual_traits<U>::depth),                    \
+      int>::type = 0,                                                                              \
+      typename std::enable_if<can_promote<std::complex<dual<T> >, U>::value, int>::type = 0,       \
+      typename common_t = typename duals::promote<std::complex<dual<T> >, U>::type
+
+#define CPPDUALS_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0
+
+#endif
+
+/*! \page user Background
+  TODO: Add text here...
+*/
+
+/// Abstract dual number class.  Can nest with other dual numbers and
+/// complex numbers.
+template <class T> class dual {
+public:
+  /// Class type of rpart() and dpart().  This type can be nested
+  /// dual<> or std::complex<>.
+  typedef T value_type;
+
+private:
+  /// The real part.
+  value_type _real;
+  /// The dual part.
+  value_type _dual;
+
+public:
+  /// Construct dual from optional real and dual parts.
+  constexpr dual(const value_type re = value_type(), const value_type du = value_type())
+      : _real(re), _dual(du) {}
+
+  /// Copy construct from a dual of equal depth.
+  template <class X, CPPDUALS_ONLY_SAME_DEPTH_AS_T(T, X), CPPDUALS_ENABLE_IF(!is_complex<X>::value)>
+  dual(const dual<X> &x) : _real(x.rpart()), _dual(x.dpart()) {}
+
+  /// Cast to a complex<dual<>> with real part equal to *this.
+  operator std::complex<dual<T> >() { return std::complex<dual<T> >(*this); }
+
+  /// Explicit cast to an arithmetic type retains the rpart()
+  template <class X, CPPDUALS_ENABLE_IF(std::is_arithmetic<X>::value && !is_dual<X>::value)>
+  explicit operator X() const {
+    return X(_real);
+  }
+
+  /// Get the real part.
+  T rpart() const { return _real; }
+
+  /// Get the dual part.
+  T dpart() const { return _dual; }
+
+  /// Set the real part.
+  void rpart(value_type re) { _real = re; }
+
+  /// Get the dual part.
+  void dpart(value_type du) { _dual = du; }
+
+  /// Unary negation
+  dual<T> operator-() const { return dual<T>(-_real, -_dual); }
+
+  /// Unary nothing
+  dual<T> operator+() const { return *this; }
+
+  /// Assignment of `value_type` assigns the real part and zeros the dual part.
+  dual<T> &operator=(const T &x) {
+    _real = x;
+    _dual = value_type();
+    return *this;
+  }
+
+  /// Add a relatively-scalar to this dual.
+  dual<T> &operator+=(const T &x) {
+    _real += x;
+    return *this;
+  }
+
+  /// Subtract a relatively-scalar from this dual.
+  dual<T> &operator-=(const T &x) {
+    _real -= x;
+    return *this;
+  }
+
+  /// Multiply a relatively-scalar with this dual.
+  dual<T> &operator*=(const T &x) {
+    _real *= x;
+    _dual *= x;
+    return *this;
+  }
+
+  /// Divide this dual by relatively-scalar.
+  dual<T> &operator/=(const T &x) {
+    _real /= x;
+    _dual /= x;
+    return *this;
+  }
+
+  // dua & operator=(const dual & x)  { _real =  x.rpart(); _dual =  x.dpart(); }
+  template <class X, CPPDUALS_ONLY_SAME_DEPTH_AS_T(T, X)> dual<T> &operator=(const dual<X> &x) {
+    _real = x.rpart();
+    _dual = x.dpart();
+    return *this;
+  }
+
+  /// Add a dual of the same depth to this dual.
+  template <class X, CPPDUALS_ONLY_SAME_DEPTH_AS_T(T, X)> dual<T> &operator+=(const dual<X> &x) {
+    _real += x.rpart();
+    _dual += x.dpart();
+    return *this;
+  }
+
+  /// Subtract a dual of the same depth from this dual.
+  template <class X, CPPDUALS_ONLY_SAME_DEPTH_AS_T(T, X)> dual<T> &operator-=(const dual<X> &x) {
+    _real -= x.rpart();
+    _dual -= x.dpart();
+    return *this;
+  }
+
+  /// Multiply this dual with a dual of same depth.
+  template <class X, CPPDUALS_ONLY_SAME_DEPTH_AS_T(T, X)> dual<T> &operator*=(const dual<X> &x) {
+    _dual = _real * x.dpart() + _dual * x.rpart();
+    _real = _real * x.rpart();
+    return *this;
+  }
+
+  /// Divide this dual by another dual of the same or lower depth.
+  template <class X, CPPDUALS_ONLY_SAME_DEPTH_AS_T(T, X)> dual<T> &operator/=(const dual<X> &x) {
+    _dual = (_dual * x.rpart() - _real * x.dpart()) / (x.rpart() * x.rpart());
+    _real = _real / x.rpart();
+    return *this;
+  }
+
+  // The following comparison operators are not strictly well-defined,
+  // they are implemented as comparison of the real parts.
+#if 0
+  /// Compare this dual with another dual, comparing real parts for equality.
+  template<class X, CPPDUALS_ONLY_SAME_DEPTH_AS_T(T,X)>
+  bool operator ==(const dual<X> &b) const { return _real == b.rpart(); }
+
+  /// Compare this dual with another dual, comparing real parts for inequality.
+  template<class X, CPPDUALS_ONLY_SAME_DEPTH_AS_T(T,X)>
+  bool operator !=(const dual<X> &b) const { return _real != b.rpart(); }
+
+  /// Compare real part against real part of b
+  template<class X, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T,X)>
+  bool operator <(const dual<X> &b) const { return _real < b.rpart(); }
+
+  /// Compare real part against real part of b
+  template<class X, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T,X)>
+  bool operator >(const dual<X> &b) const { return _real > b.rpart(); }
+
+  /// Compare real part against real part of b
+  template<class X, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T,X)>
+  bool operator <=(const dual<X> &b) const { return _real <= b.rpart(); }
+
+  /// Compare real part against real part of b
+  template<class X, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T,X)>
+  bool operator >=(const dual<X> &b) const { return _real >= b.rpart(); }
+#endif
+};
+
+/// Get the dual's real part.
+template <class T> T rpart(const dual<T> &x) { return x.rpart(); }
+
+/// Get the dual's dual part.
+template <class T> T dpart(const dual<T> &x) { return x.dpart(); }
+
+/// R-part of complex<dual<>> is non-dual complex<> (not to be confused with real())
+template <class T> std::complex<T> rpart(const std::complex<dual<T> > &x) {
+  return std::complex<T>(x.real().rpart(), x.imag().rpart());
+}
+
+/// Dual part of complex<dual<>> is complex<>
+template <class T> std::complex<T> dpart(const std::complex<dual<T> > &x) {
+  return std::complex<T>(x.real().dpart(), x.imag().dpart());
+}
+
+/// Get a non-dual's real part.
+template <class T,
+          CPPDUALS_ENABLE_IF((std::is_arithmetic<T>::value && !std::is_compound<T>::value) ||
+                             is_complex<T>::value)>
+T rpart(const T &x) {
+  return x;
+}
+
+/// Get a non-dual's dual part := zero.
+template <class T,
+          CPPDUALS_ENABLE_IF((std::is_arithmetic<T>::value && !std::is_compound<T>::value) ||
+                             is_complex<T>::value)>
+T dpart(const T &) {
+  return T(0);
+}
+
+#ifndef PARSED_BY_DOXYGEN
+
+/// Dual +-*/ ops with another entity
+#define CPPDUALS_BINARY_OP(op)                                                                     \
+  template <class T, class U, CPPDUALS_ENABLE_SAME_DEPTH_AND_COMMON_T(T, U)>                       \
+  common_t operator op(const dual<T> &z, const dual<U> &w) {                                       \
+    common_t x(z);                                                                                 \
+    return x op## = w;                                                                             \
+  }                                                                                                \
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U),                        \
+            CPPDUALS_ENABLE_IF(!std::is_same<U, std::complex<dual<T> > >::value)>                  \
+  common_t operator op(const dual<T> &z, const U &w) {                                             \
+    common_t x(z);                                                                                 \
+    return x op## = w;                                                                             \
+  }                                                                                                \
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U),                        \
+            CPPDUALS_ENABLE_IF(!std::is_same<U, std::complex<dual<T> > >::value)>                  \
+  common_t operator op(const U &z, const dual<T> &w) {                                             \
+    common_t x(z);                                                                                 \
+    return x op## = w;                                                                             \
+  }                                                                                                \
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_CX_COMMON_T(T, U),                     \
+            CPPDUALS_ENABLE_IF(!std::is_same<U, std::complex<dual<T> > >::value)>                  \
+  common_t operator op(const std::complex<dual<T> > &z, const U &w) {                              \
+    common_t x(z);                                                                                 \
+    return x op## = w;                                                                             \
+  }                                                                                                \
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_CX_COMMON_T(T, U),                     \
+            CPPDUALS_ENABLE_IF(!std::is_same<U, std::complex<dual<T> > >::value)>                  \
+  common_t operator op(const U &z, const std::complex<dual<T> > &w) {                              \
+    common_t x(z);                                                                                 \
+    return x op## = w;                                                                             \
+  }
+
+CPPDUALS_BINARY_OP(+)
+CPPDUALS_BINARY_OP(-)
+CPPDUALS_BINARY_OP(*)
+CPPDUALS_BINARY_OP(/)
+
+/// Dual compared to a non-complex lower rank thing
+#define CPPDUALS_LHS_COMPARISON(op)                                                                \
+  template <class T, class U, CPPDUALS_ENABLE_SAME_DEPTH_AND_COMMON_T(T, U)>                       \
+  bool operator op(const dual<T> &a, const dual<U> &b) {                                           \
+    return a.rpart() op b.rpart();                                                                 \
+  }                                                                                                \
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U),                        \
+            CPPDUALS_ENABLE_IF(!is_complex<U>::value)>                                             \
+  bool operator op(const U &a, const dual<T> &b) {                                                 \
+    return a op b.rpart();                                                                         \
+  }                                                                                                \
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U),                        \
+            CPPDUALS_ENABLE_IF(!is_complex<U>::value)>                                             \
+  bool operator op(const dual<T> &a, const U &b) {                                                 \
+    return a.rpart() op b;                                                                         \
+  }
+
+CPPDUALS_LHS_COMPARISON(<)
+CPPDUALS_LHS_COMPARISON(>)
+CPPDUALS_LHS_COMPARISON(<=)
+CPPDUALS_LHS_COMPARISON(>=)
+CPPDUALS_LHS_COMPARISON(==)
+CPPDUALS_LHS_COMPARISON(!=)
+
+#endif // PARSED_BY_DOXYGEN
+
+} // namespace duals
+
+// some useful functions of solely the real part
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#endif
+
+#ifndef PARSED_BY_DOXYGEN
+
+#define make_math(T)                                                                               \
+  inline T(frexp)(const duals::dual<T> &arg, int *exp) { return (frexp)(arg.rpart(), exp); }       \
+  inline duals::dual<T>(ldexp)(const duals::dual<T> &arg, int exp) {                               \
+    return arg * std::pow((T)2, exp);                                                              \
+  }                                                                                                \
+  inline T(trunc)(const duals::dual<T> &d) { return (trunc)(d.rpart()); }                          \
+  inline T(floor)(const duals::dual<T> &d) { return (floor)(d.rpart()); }                          \
+  inline T(ceil)(const duals::dual<T> &d) { return (ceil)(d.rpart()); }                            \
+  inline T(round)(const duals::dual<T> &d) { return (round)(d.rpart()); }                          \
+  inline int(fpclassify)(const duals::dual<T> &d) { return (fpclassify)(d.rpart()); }              \
+  inline bool(isfinite)(const duals::dual<T> &d) { return (isfinite)(d.rpart()); }                 \
+  inline bool(isnormal)(const duals::dual<T> &d) { return (isnormal)(d.rpart()); }                 \
+  inline bool(isinf)(const duals::dual<T> &d) { return (isinf)(d.rpart()); }                       \
+  inline bool(isnan)(const duals::dual<T> &d) { return (isnan)(d.rpart()); }
+
+make_math(float) make_math(double) make_math(long double)
+
+#undef make_math
+
+#endif // PARSED_BY_DOXYGEN
+
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+    _LIBCPP_END_NAMESPACE_STD
+#else
+} // namespace std
+#endif
+    namespace duals {
+
+  namespace randos {
+
+  // Random real value between a and b.
+  template <class T, typename dist = std::uniform_real_distribution<T>,
+            CPPDUALS_ENABLE_IF(!is_complex<T>::value && !is_dual<T>::value)>
+  T random(T a = T(0), T b = T(1)) {
+    static std::default_random_engine generator;
+    dist distribution(a, b);
+    return distribution(generator);
+  }
+
+  template <class T, typename dist = std::uniform_real_distribution<T>,
+            CPPDUALS_ENABLE_IF(!is_complex<T>::value && !is_dual<T>::value)>
+  T random2(T a = T(0), T b = T(1)) {
+    return random<T, dist>(a, b);
+  }
+
+  // Helper class for testing - also random value in dual part.
+  template <class DT, CPPDUALS_ENABLE_IF(is_dual<DT>::value)>
+  DT random2(DT a = DT(0, 0), DT b = DT(1, 1)) {
+    using randos::random;
+    return DT(a.rpart() + random2<typename DT::value_type>() * (b.rpart() - a.rpart()),
+              a.dpart() + random2<typename DT::value_type>() * (b.dpart() - a.dpart()));
+  }
+
+  // Helper class for testing - also random value in dual part of the complex.
+  template <class CT, CPPDUALS_ENABLE_IF(is_complex<CT>::value)>
+  CT random2(CT a = CT(0, 0), CT b = CT(1, 1)) {
+    return CT(a.real() + random2<typename CT::value_type>() * (b.real() - a.real()),
+              a.imag() + random2<typename CT::value_type>() * (b.imag() - a.imag()));
+  }
+
+  } // namespace randos
+
+  /// Random real and dual parts, used by Eigen's Random(), by default
+  // the returned value has zero dual part and is in the range [0+0_e,
+  // 1+0_e].
+  template <class DT, typename dist = std::uniform_real_distribution<typename DT::value_type>,
+            CPPDUALS_ENABLE_IF(is_dual<DT>::value)>
+  DT random(DT a = DT(0, 0), DT b = DT(1, 0)) {
+    using randos::random;
+    return DT(random<typename DT::value_type, dist>(a.rpart(), b.rpart()),
+              random<typename DT::value_type, dist>(a.dpart(), b.dpart()));
+  }
+
+  /// Complex Conjugate of a dual is just the dual.
+  template <class T> dual<T> conj(const dual<T> &x) { return x; }
+
+  /// Conjugate a thing that's not dual and not complex -- it has no
+  /// complex value so just return it.  This is different from
+  /// std::conj() which promotes the T to a std::complex<T>.
+  template <class T, CPPDUALS_ENABLE_IF(!is_dual<T>::value && !is_complex<T>::value &&
+                                        std::is_arithmetic<T>::value)>
+  T conj(const T &x) {
+    return x;
+  }
+
+  /// Dual Conjugate, such that x*dconj(x) = rpart(x)^2.  Different
+  /// function name from complex conjugate conj().
+  template <class T> dual<T> dconj(const dual<T> &x) { return dual<T>(x.rpart(), -x.dpart()); }
+
+  /// Dual Conjugate of a complex, such that x*dconj(x) = rpart(x)^2.
+  /// Different function name from complex conjugate conj().
+  template <class T> std::complex<T> dconj(const std::complex<T> &x) {
+    return std::complex<T>(dconj(x.real()), dconj(x.imag()));
+  }
+
+  /// Conjugate a thing that's not dual and not complex.
+  template <class T, CPPDUALS_ENABLE_IF(!is_dual<T>::value && !is_complex<T>::value &&
+                                        std::is_arithmetic<T>::value)>
+  T dconj(const T &x) {
+    return x;
+  }
+
+  /// Exponential e^x
+  template <class T> dual<T> exp(const dual<T> &x) {
+    using std::exp;
+    T v = exp(x.rpart());
+    return dual<T>(v, v * x.dpart());
+  }
+
+  /// Natural log ln(x)
+  template <class T> dual<T> log(const dual<T> &x) {
+    using std::log;
+    T v = log(x.rpart());
+    if (x.dpart() == T(0))
+      return v;
+    else
+      return dual<T>(v, x.dpart() / x.rpart());
+  }
+
+  template <class T> dual<T> log10(const dual<T> &x) {
+    using std::log;
+    return log(x) / log(static_cast<T>(10));
+  }
+
+  template <class T> dual<T> log2(const dual<T> &x) {
+    using std::log;
+    return log(x) / log(static_cast<T>(2));
+  }
+
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U)>
+  common_t pow(const dual<T> &x, const U &y) {
+    using std::pow;
+    typedef typename common_t::value_type V;
+
+    return common_t(pow(x.rpart(), y),
+                    x.dpart() == V(0) ? V(0) : x.dpart() * y * pow(x.rpart(), y - U(1)));
+  }
+
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U)>
+  common_t pow(const U &x, const dual<T> &y) {
+    return pow(common_t(x), y);
+  }
+
+  template <class T, class U, CPPDUALS_ENABLE_SAME_DEPTH_AND_COMMON_T(T, U)>
+  common_t pow(const dual<T> &f, const dual<U> &g) {
+    using std::log;
+    using std::pow;
+#if 1
+    using std::floor;
+    typedef typename common_t::value_type V;
+    common_t result;
+
+    if (f.rpart() == T(0) && g.rpart() >= U(1)) {
+      if (g.rpart() > U(1)) { result = common_t(0); }
+      else { result = f; }
+    }
+    else {
+      if (f.rpart() < T(0) && g.rpart() == floor(g.rpart())) {
+        V const tmp = g.rpart() * pow(f.rpart(), g.rpart() - U(1.0));
+        result = common_t(pow(f.rpart(), g.rpart()), f.dpart() == T(0) ? T(0) : f.dpart() * tmp);
+        if (g.dpart() != U(0.0)) {
+          // Return a NaN when g.dpart() != 0.
+          result.dpart(std::numeric_limits<T>::quiet_NaN());
+        }
+      }
+      else {
+        // Handle the remaining cases. For cases 4,5,6,9 we allow the log()
+        // function to generate -HUGE_VAL or NaN, since those cases result in a
+        // nonfinite derivative.
+        V const tmp1 = pow(f.rpart(), g.rpart());
+        V const tmp2 = g.rpart() * pow(f.rpart(), g.rpart() - T(1.0));
+        V const tmp3 = tmp1 * log(f.rpart());
+        result = common_t(tmp1, f.dpart() == T(0) && g.dpart() == U(0)
+                                    ? V(0)
+                                    : tmp2 * f.dpart() + tmp3 * g.dpart());
+      }
+    }
+
+    return result;
+#else
+  T v = pow(f.rpart(), g.rpart());
+  return common_t(v, pow(f.rpart(), g.rpart() - T(1)) *
+                         (g.rpart() * f.dpart() + f.rpart() * log(f.rpart()) * g.dpart()));
+#endif
+  }
+
+  namespace utils {
+  template <typename T> int sgn(T val) { return (T(0) < val) - (val < T(0)); }
+  } // namespace utils
+
+  template <class T> dual<T> abs(const dual<T> &x) {
+    using std::abs;
+    return dual<T>(abs(x.rpart()), x.dpart() * utils::sgn(x.rpart()));
+  }
+
+  template <class T> dual<T> fabs(const dual<T> &x) {
+    using std::fabs;
+    return dual<T>(fabs(x.rpart()), x.dpart() * utils::sgn(x.rpart()));
+  }
+
+#if 0
+// TODO?
+template<class T> dual<T> abs2(const dual<T> & x) {
+  using std::abs;
+  return dual<T>(x.rpart() * x.rpart(),
+                 xxx x.dpart() * utils::sgn(x.rpart()));
+}
+#endif
+
+  template <class T> duals::dual<T> copysign(const duals::dual<T> &x, const duals::dual<T> &y) {
+    using std::copysign;
+    T r = copysign(x.rpart(), y.rpart());
+    return duals::dual<T>(r, r == x.rpart() ? x.dpart() : -x.dpart());
+  }
+
+  template <class T> duals::dual<T> hypot(const duals::dual<T> &x, const duals::dual<T> &y) {
+    return sqrt(x * x + y * y);
+  }
+
+  template <class T> duals::dual<T> scalbn(const duals::dual<T> &arg, int ex) {
+    return arg * std::pow((T)2, ex);
+  }
+
+  template <class T> duals::dual<T>(fmax)(const duals::dual<T> &x, const duals::dual<T> &y) {
+    return x.rpart() > y.rpart() ? x : y;
+  }
+
+  template <class T> duals::dual<T>(fmin)(const duals::dual<T> &x, const duals::dual<T> &y) {
+    return x.rpart() <= y.rpart() ? x : y;
+  }
+
+  template <class T> duals::dual<T> logb(const duals::dual<T> &x) { return duals::log2(x); }
+
+  template <class T> int(fpclassify)(const duals::dual<T> &d) {
+    using std::fpclassify;
+    return (fpclassify)(d.rpart());
+  }
+  template <class T> bool(isfinite)(const duals::dual<T> &d) {
+    using std::isfinite;
+    return (isfinite)(d.rpart());
+  }
+  template <class T> bool(isnormal)(const duals::dual<T> &d) {
+    using std::isnormal;
+    return (isnormal)(d.rpart());
+  }
+  template <class T> bool(isinf)(const duals::dual<T> &d) {
+    using std::isinf;
+    return (isinf)(d.rpart());
+  }
+  template <class T> bool(isnan)(const duals::dual<T> &d) {
+    using std::isnan;
+    return (isnan)(d.rpart());
+  }
+  template <class T> bool(signbit)(const duals::dual<T> &d) {
+    using std::signbit;
+    return (signbit)(d.rpart());
+  }
+
+  template <class T> dual<T> sqrt(const dual<T> &x) {
+    using std::sqrt;
+    T v = sqrt(x.rpart());
+    if (x.dpart() == T(0))
+      return v;
+    else
+      return dual<T>(v, x.dpart() / (T(2) * v));
+  }
+
+  template <class T> dual<T> cbrt(const dual<T> &x) {
+    using std::cbrt;
+    T v = cbrt(x.rpart());
+    if (x.dpart() == T(0))
+      return v;
+    else
+      return dual<T>(v, x.dpart() / (T(3) * v * v));
+  }
+
+  template <class T> dual<T> sin(const dual<T> &x) {
+    using std::cos;
+    using std::sin;
+    return dual<T>(sin(x.rpart()), x.dpart() * cos(x.rpart()));
+  }
+
+  template <class T> dual<T> cos(const dual<T> &x) {
+    using std::cos;
+    using std::sin;
+    return dual<T>(cos(x.rpart()), -sin(x.rpart()) * x.dpart());
+  }
+
+  template <class T> dual<T> tan(const dual<T> &x) {
+    using std::tan;
+    T v = tan(x.rpart());
+    return dual<T>(v, x.dpart() * (v * v + 1));
+  }
+
+  template <class T> dual<T> asin(const dual<T> &x) {
+    using std::asin;
+    using std::sqrt;
+    T v = asin(x.rpart());
+    if (x.dpart() == T(0))
+      return v;
+    else
+      return dual<T>(v, x.dpart() / sqrt(1 - x.rpart() * x.rpart()));
+  }
+
+  template <class T> dual<T> acos(const dual<T> &x) {
+    using std::acos;
+    using std::sqrt;
+    T v = acos(x.rpart());
+    if (x.dpart() == T(0))
+      return v;
+    else
+      return dual<T>(v, -x.dpart() / sqrt(1 - x.rpart() * x.rpart()));
+  }
+
+  template <class T> dual<T> atan(const dual<T> &x) {
+    using std::atan;
+    T v = atan(x.rpart());
+    if (x.dpart() == T(0))
+      return v;
+    else
+      return dual<T>(v, x.dpart() / (1 + x.rpart() * x.rpart()));
+  }
+
+  template <class T> dual<T> atan2(const dual<T> &y, const dual<T> &x) {
+    using std::atan2;
+    T v = atan2(y.rpart(), x.rpart());
+    return dual<T>(v, (x.rpart() * y.dpart() - y.rpart() * x.dpart()) /
+                          (y.rpart() * y.rpart() + x.rpart() * x.rpart()));
+  }
+
+  // TODO
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U)>
+  common_t atan2(const dual<T> &y, const U &x) {
+    using std::atan2;
+    T v = atan2(y.rpart(), x);
+    return dual<T>(v, (x * y.dpart()) / (y.rpart() * y.rpart() + x * x));
+  }
+
+  // TODO
+  template <class T, class U, CPPDUALS_ENABLE_LEQ_DEPTH_AND_COMMON_T(T, U)>
+  common_t atan2(const U &y, const dual<T> &x) {
+    using std::atan2;
+    T v = atan2(y, x.rpart());
+    return dual<T>(v, (-y * x.dpart()) / (y * y + x.rpart() * x.rpart()));
+  }
+
+  // TODO
+  template <class T> dual<T> sinh(const dual<T> &x);
+  template <class T> dual<T> cosh(const dual<T> &x);
+  template <class T> dual<T> tanh(const dual<T> &x);
+  template <class T> dual<T> asinh(const dual<T> &x);
+  template <class T> dual<T> acosh(const dual<T> &x);
+  template <class T> dual<T> atanh(const dual<T> &x);
+  template <class T> dual<T> log1p(const dual<T> &x);
+  template <class T> dual<T> expm1(const dual<T> &x);
+
+  ////////////////////////////////////////////////////////////////
+  // added for meep forward-diff operations
+  ////////////////////////////////////////////////////////////////
+
+  /// The tanh function.  Make sure to `#include <math.h>` before
+  /// `#include <duals/dual>` to use this function.
+  template <class T> dual<T> tanh(const dual<T> &x) {
+    using std::tanh;
+    T sech = 1.0 / std::cosh(x.rpart());
+    return dual<T>(tanh(x.rpart()), x.dpart() * sech * sech);
+  }
+
+  ////////////////////////////////////////////////////////////////
+  //
+  ////////////////////////////////////////////////////////////////
+
+  /// The error function.  Make sure to `#include <math.h>` before
+  /// `#include <duals/dual>` to use this function.
+  template <class T> dual<T> erf(const dual<T> &x) {
+    using std::erf;
+    using std::exp;
+    using std::pow;
+    using std::sqrt;
+    return dual<T>(erf(x.rpart()), x.dpart() * T(2) / sqrt(T(MY_PI)) * exp(-pow(x.rpart(), T(2))));
+  }
+
+  /// Error function complement (1 - erf()).
+  template <class T> dual<T> erfc(const dual<T> &x) {
+    using std::erfc;
+    using std::exp;
+    using std::pow;
+    using std::sqrt;
+    return dual<T>(erfc(x.rpart()),
+                   x.dpart() * -T(2) / sqrt(T(MY_PI)) * exp(-pow(x.rpart(), T(2))));
+  }
+
+  /// Gamma function.  Approximation of the dual part.
+  // TODO specialize for integers
+  template <class T> dual<T> tgamma(const dual<T> &x) {
+    using std::tgamma;
+    T v = tgamma(x.rpart());
+    if (x.dpart() == T(0))
+      return v;
+    else {
+      int errno_saved = errno;
+      T h(T(1) / (1ull << (std::numeric_limits<T>::digits / 3)));
+      T w((tgamma(x.rpart() + h) - tgamma(x.rpart() - h)) / (2 * h));
+      errno = errno_saved;
+      return dual<T>(v, x.dpart() * w);
+    }
+  }
+
+  /// Log of absolute value of gamma function.  Approximation of the dual part.
+  template <class T> dual<T> lgamma(const dual<T> &x) {
+    using std::lgamma;
+    T v = lgamma(x.rpart());
+    if (x.dpart() == T(0))
+      return v;
+    else {
+#if CPPDUALS_HAVE_SIGNGAM
+      int signgam_saved = signgam;
+#endif
+      int errno_saved = errno;
+      T h(T(1) / (1ull << (std::numeric_limits<T>::digits / 3)));
+      T w((lgamma(x.rpart() + h) - lgamma(x.rpart() - h)) / (2 * h));
+#if CPPDUALS_HAVE_SIGNGAM
+      signgam = signgam_saved;
+#endif
+      errno = errno_saved;
+      return dual<T>(v, x.dpart() * w);
+    }
+  }
+
+  /// Putto operator
+  template <class T, class _CharT, class _Traits>
+  std::basic_ostream<_CharT, _Traits> &operator<<(std::basic_ostream<_CharT, _Traits> &os,
+                                                  const dual<T> &x) {
+    std::basic_ostringstream<_CharT, _Traits> s;
+    s.flags(os.flags());
+    s.imbue(os.getloc());
+    s.precision(os.precision());
+    s << '(' << x.rpart() << (x.dpart() < 0 ? "" : "+") << x.dpart() << "_e"
+      << (std::is_same<typename std::decay<T>::type, float>::value         ? "f"
+          : std::is_same<typename std::decay<T>::type, double>::value      ? ""
+          : std::is_same<typename std::decay<T>::type, long double>::value ? "l"
+                                                                           : "")
+      << ")";
+    return os << s.str();
+  }
+
+  /// Stream reader
+  template <class T, class CharT, class Traits>
+  std::basic_istream<CharT, Traits> &operator>>(std::basic_istream<CharT, Traits> &is, dual<T> &x) {
+    if (is.good()) {
+      ws(is);
+      if (is.peek() == CharT('(')) {
+        is.get();
+        T r;
+        is >> r;
+        if (!is.fail()) {
+          CharT c = is.peek();
+          if (c != CharT('_')) {
+            ws(is);
+            c = is.peek();
+          }
+          if (c == CharT('+') || c == CharT('-') || c == CharT('_')) {
+            if (c == CharT('+')) is.get();
+            T d;
+            if (c == CharT('_')) {
+              d = r;
+              r = 0;
+            }
+            else
+              is >> d;
+            if (!is.fail()) {
+              ws(is);
+              c = is.peek();
+              if (c == CharT('_')) {
+                is.get();
+                c = is.peek();
+                if (c == CharT('e')) {
+                  is.get();
+                  c = is.peek();
+                  if ((c == 'f' && !std::is_same<typename std::decay<T>::type, float>::value) ||
+                      (c == 'l' && !std::is_same<typename std::decay<T>::type, long double>::value))
+                    is.setstate(std::ios_base::failbit);
+                  else {
+                    if (c == 'f' || c == 'l') is.get();
+                    ws(is);
+                    c = is.peek();
+                    if (c == ')') {
+                      is.get();
+                      x = dual<T>(r, d);
+                    }
+                    else
+                      is.setstate(std::ios_base::failbit);
+                  }
+                }
+                else
+                  is.setstate(std::ios_base::failbit);
+              }
+              else
+                is.setstate(std::ios_base::failbit);
+            }
+            else
+              is.setstate(std::ios_base::failbit);
+          }
+          else if (c == CharT(')')) {
+            is.get();
+            x = dual<T>(r, T(0));
+          }
+          else
+            is.setstate(std::ios_base::failbit);
+        }
+        else
+          is.setstate(std::ios_base::failbit);
+      }
+      else {
+        T r;
+        is >> r;
+        if (!is.fail())
+          x = dual<T>(r, T(0));
+        else
+          is.setstate(std::ios_base::failbit);
+      }
+    }
+    else
+      is.setstate(std::ios_base::failbit);
+    return is;
+  }
+
+#if __cpp_user_defined_literals >= 200809
+  /// Dual number literals in namespace duals::literals
+  inline namespace literals {
+  using duals::dual;
+  constexpr dual<float> operator"" _ef(long double du) { return {0.0f, static_cast<float>(du)}; }
+  constexpr dual<float> operator"" _ef(unsigned long long du) {
+    return {0.0f, static_cast<float>(du)};
+  }
+  constexpr dual<double> operator"" _e(long double du) { return {0.0, static_cast<double>(du)}; }
+  constexpr dual<double> operator"" _e(unsigned long long du) {
+    return {0.0, static_cast<double>(du)};
+  }
+  constexpr dual<long double> operator"" _el(long double du) { return {0.0l, du}; }
+  constexpr dual<long double> operator"" _el(unsigned long long du) {
+    return {0.0l, static_cast<long double>(du)};
+  }
+  } // namespace literals
+#endif
+
+  typedef dual<float> dualf;
+  typedef dual<double> duald;
+  typedef dual<long double> dualld;
+  typedef dual<dualf> hyperdualf;
+  typedef dual<duald> hyperduald;
+  typedef dual<dualld> hyperdualld;
+  typedef std::complex<dualf> cdualf;
+  typedef std::complex<duald> cduald;
+  typedef std::complex<dualld> cdualld;
+
+} // namespace duals
+
+#ifdef CPPDUALS_LIBFMT
+#include <fmt/format.h>
+/// duals::dual<> Formatter for libfmt https://github.com/fmtlib/fmt
+///
+/// Formats a dual number (r,d) as (r+d_e), offering the same
+/// formatting options as the underlying type - with the addition of
+/// three optional format options, only one of which may appear
+/// directly after the ':' in the format spec: '$', '*', and ','.  The
+/// '*' flag changes the separating _ to a *, producing (r+d*e), where
+/// r and d are the formatted value_type values.  The ',' flag simply
+/// prints the real and dual parts separated by a comma.  As a
+/// concrete exmple, this formatter can produce either (3+5.4_e) or
+/// (3+5.4*e) or (3,5.4) for a dual<double> using the specs {:g},
+/// {:*g}, or {:,g}, respectively.  When the '*' is NOT specified, the
+/// output should be compatible with the input operator>> and the dual
+/// literals below. (this implementation is a bit hacky - glad for
+/// cleanups).
+template <typename T, typename Char>
+struct fmt::formatter<duals::dual<T>, Char> : public fmt::formatter<T, Char> {
+  typedef fmt::formatter<T, Char> base;
+  enum style { expr, star, pair } style_ = expr;
+  fmt::detail::dynamic_format_specs<Char> specs_;
+  FMT_CONSTEXPR auto parse(format_parse_context &ctx) -> decltype(ctx.begin()) {
+    using handler_type = fmt::detail::dynamic_specs_handler<format_parse_context>;
+    auto type = fmt::detail::type_constant<T, Char>::value;
+    fmt::detail::specs_checker<handler_type> handler(handler_type(specs_, ctx), type);
+    auto it = ctx.begin();
+    if (it != ctx.end()) switch (*it) {
+        case '$':
+          style_ = style::expr;
+          ctx.advance_to(++it);
+          break;
+        case '*':
+          style_ = style::star;
+          ctx.advance_to(++it);
+          break;
+        case ',':
+          style_ = style::pair;
+          ctx.advance_to(++it);
+          break;
+        default: break;
+      }
+    parse_format_specs(ctx.begin(), ctx.end(), handler);
+    return base::parse(ctx);
+  }
+  template <typename FormatCtx>
+  auto format(const duals::dual<T> &x, FormatCtx &ctx) -> decltype(ctx.out()) {
+    format_to(ctx.out(), "(");
+    if (style_ == style::pair) {
+      base::format(x.rpart(), ctx);
+      format_to(ctx.out(), ",");
+      base::format(x.dpart(), ctx);
+      return format_to(ctx.out(), ")");
+    }
+    if (x.rpart() || !x.dpart()) base::format(x.rpart(), ctx);
+    if (x.dpart()) {
+      if (x.rpart() && x.dpart() >= 0 && specs_.sign != sign::plus) format_to(ctx.out(), "+");
+      base::format(x.dpart(), ctx);
+      if (style_ == style::star)
+        format_to(ctx.out(), "*e");
+      else
+        format_to(ctx.out(), "_e");
+      if (std::is_same<typename std::decay<T>::type, float>::value) format_to(ctx.out(), "f");
+      if (std::is_same<typename std::decay<T>::type, long double>::value) format_to(ctx.out(), "l");
+    }
+    return format_to(ctx.out(), ")");
+  }
+};
+#endif
+
+#ifdef CPPDUALS_LIBFMT_COMPLEX
+#ifndef CPPDUALS_LIBFMT
+#include <fmt/format.h>
+#endif
+/// std::complex<> Formatter for libfmt https://github.com/fmtlib/fmt
+///
+/// libfmt does not provide a formatter for std::complex<>, although
+/// one is proposed for c++20.  Anyway, at the expense of a k or two,
+/// you can define CPPDUALS_LIBFMT_COMPLEX and get this one.
+///
+/// The standard iostreams formatting of complex numbers is (a,b),
+/// where a and b are the real and imaginary parts.  This formats a
+/// complex number (a+bi) as (a+bi), offering the same formatting
+/// options as the underlying type - with the addition of three
+/// optional format options, only one of which may appear directly
+/// after the ':' in the format spec (before any fill or align): '$'
+/// (the default if no flag is specified), '*', and ','.  The '*' flag
+/// adds a * before the 'i', producing (a+b*i), where a and b are the
+/// formatted value_type values.  The ',' flag simply prints the real
+/// and complex parts separated by a comma (same as iostreams' format).
+/// As a concrete exmple, this formatter can produce either (3+5.4i)
+/// or (3+5.4*i) or (3,5.4) for a complex<double> using the specs {:g}
+/// | {:$g}, {:*g}, or {:,g}, respectively.  (this implementation is a
+/// bit hacky - glad for cleanups).
+///
+template <typename T, typename Char>
+struct fmt::formatter<std::complex<T>, Char> : public fmt::formatter<T, Char> {
+  typedef fmt::formatter<T, Char> base;
+  enum style { expr, star, pair } style_ = expr;
+  fmt::detail::dynamic_format_specs<Char> specs_;
+  FMT_CONSTEXPR auto parse(format_parse_context &ctx) -> decltype(ctx.begin()) {
+    using handler_type = fmt::detail::dynamic_specs_handler<format_parse_context>;
+    auto type = fmt::detail::type_constant<T, Char>::value;
+    fmt::detail::specs_checker<handler_type> handler(handler_type(specs_, ctx), type);
+    auto it = ctx.begin();
+    if (it != ctx.end()) {
+      switch (*it) {
+        case '$':
+          style_ = style::expr;
+          ctx.advance_to(++it);
+          break;
+        case '*':
+          style_ = style::star;
+          ctx.advance_to(++it);
+          break;
+        case ',':
+          style_ = style::pair;
+          ctx.advance_to(++it);
+          break;
+        default: break;
+      }
+    }
+    parse_format_specs(ctx.begin(), ctx.end(), handler);
+    // todo: fixup alignment
+    return base::parse(ctx);
+  }
+  template <typename FormatCtx>
+  auto format(const std::complex<T> &x, FormatCtx &ctx) -> decltype(ctx.out()) {
+    format_to(ctx.out(), "(");
+    if (style_ == style::pair) {
+      base::format(x.real(), ctx);
+      format_to(ctx.out(), ",");
+      base::format(x.imag(), ctx);
+      return format_to(ctx.out(), ")");
+    }
+    if (x.real() || !x.imag()) base::format(x.real(), ctx);
+    if (x.imag()) {
+      if (x.real() && x.imag() >= 0 && specs_.sign != sign::plus) format_to(ctx.out(), "+");
+      base::format(x.imag(), ctx);
+      if (style_ == style::star)
+        format_to(ctx.out(), "*i");
+      else
+        format_to(ctx.out(), "i");
+      if (std::is_same<typename std::decay<T>::type, float>::value) format_to(ctx.out(), "f");
+      if (std::is_same<typename std::decay<T>::type, long double>::value) format_to(ctx.out(), "l");
+    }
+    return format_to(ctx.out(), ")");
+  }
+};
+#endif
+
+#endif // CPPDUALS_DUAL