diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index f467b2aee33..62c3472aebd 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -28,6 +28,7 @@
 /src/KIM/             @ellio167
 /src/KOKKOS/          @stanmoore1
 /src/MANIFOLD/        @Pakketeretet2
+/src/MBX/             @Miniland1333
 /src/MDI/             @taylor-a-barnes @sjplimp
 /src/MEAM/            @martok
 /src/MESONT/          @iafoss
@@ -55,6 +56,7 @@
 /src/UEF/             @danicholson
 
 # individual files in packages
+/src/ASPHERE/*superellipsoid*        @jtclemm @jibril-b-coulibaly @JBil8
 /src/GPU/pair_vashishta_gpu.*        @andeplane
 /src/KOKKOS/pair_vashishta_kokkos.*  @andeplane @stanmoore1
 /src/KOKKOS/pair_pod_kokkos.*        @exapde @stanmoore1
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index 19d40100ea8..4b64a72d199 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -244,6 +244,8 @@ lammps/
 
 5. **Documentation:** All new commands or features must be documented. Put `.. versionadded:: TBD` or
    `.. versionchanged:: TBD` in front of paragraphs documenting the new or changed functionality.
+   The `.. versionadded:: TBD` directive should be used with new features or added keywords.
+   The `.. versionchanged:: TBD` directive should be used when the behavior of a keyword changes.
    The `TBD` will be manually replaced with the release version string during the release preparation.
    This does not apply when the change is only adding an accelerated version of an existing style.
    Instead the corresponding code letter should be added to the respective Commands_\*.rst file.
diff --git a/doc/src/Commands_bond.rst b/doc/src/Commands_bond.rst
index b127227cdc5..90ed4733b06 100644
--- a/doc/src/Commands_bond.rst
+++ b/doc/src/Commands_bond.rst
@@ -129,7 +129,7 @@ OPT.
    * :doc:`helix (o) <dihedral_helix>`
    * :doc:`lepton (o) <dihedral_lepton>`
    * :doc:`multi/harmonic (ko) <dihedral_multi_harmonic>`
-   * :doc:`nharmonic (o) <dihedral_nharmonic>`
+   * :doc:`nharmonic (ko) <dihedral_nharmonic>`
    * :doc:`opls (iko) <dihedral_opls>`
    * :doc:`quadratic (o) <dihedral_quadratic>`
    * :doc:`spherical <dihedral_spherical>`
diff --git a/doc/src/Howto_spherical.rst b/doc/src/Howto_spherical.rst
index bdbd7cc4de9..e5e8f3e2ac6 100644
--- a/doc/src/Howto_spherical.rst
+++ b/doc/src/Howto_spherical.rst
@@ -3,10 +3,10 @@ Finite-size spherical and aspherical particles
 
 Typical MD models treat atoms or particles as point masses.  Sometimes
 it is desirable to have a model with finite-size particles such as
-spheroids or ellipsoids or generalized aspherical bodies.  The
-difference is that such particles have a moment of inertia, rotational
-energy, and angular momentum.  Rotation is induced by torque coming
-from interactions with other particles.
+spheroids, ellipsoids, superellipsoids, or generalized aspherical
+bodies.  The difference is that such particles have a moment of inertia,
+rotational energy, and angular momentum.  Rotation is induced by torque
+coming from interactions with other particles.
 
 LAMMPS has several options for running simulations with these kinds of
 particles.  The following aspects are discussed in turn:
@@ -28,13 +28,13 @@ There are several :doc:`atom styles <atom_style>` that allow for
 definition of finite-size particles: sphere, dipole, ellipsoid, line,
 tri, peri, and body.
 
-The sphere style defines particles that are spheroids and each
+The *sphere* style defines particles that are spheroids and each
 particle can have a unique diameter and mass (or density).  These
 particles store an angular velocity (omega) and can be acted upon by
 torque.  The "set" command can be used to modify the diameter and mass
 of individual particles, after then are created.
 
-The dipole style does not actually define finite-size particles, but
+The *dipole* style does not actually define finite-size particles, but
 is often used in conjunction with spherical particles, via a command
 like
 
@@ -44,42 +44,45 @@ like
 
 This is because when dipoles interact with each other, they induce
 torques, and a particle must be finite-size (i.e. have a moment of
-inertia) in order to respond and rotate.  See the :doc:`atom_style dipole <atom_style>` command for details.  The "set" command can be
-used to modify the orientation and length of the dipole moment of
-individual particles, after then are created.
-
-The ellipsoid style defines particles that are ellipsoids and thus can
-be aspherical.  Each particle has a shape, specified by 3 diameters,
-and mass (or density).  Superellipsoid particles can be defined by
-specifying 2 blockiness exponents (block) and adding the `superellipsoid`
-keyword to the `atom_style ellipsoid` command.  These particles store an angular
-momentum and their orientation (quaternion), and can be acted upon by
-torque.  They do not store an angular velocity (omega), which can be
-in a different direction than angular momentum, rather they compute it
-as needed.  The "set" command can be used to modify the diameter, orientation,
-and mass of individual particles, after they are created.
-The "set" command can also be used to modify the blockiness of superellipsoid
-particles. It also has a brief explanation of what quaternions are.
-
-The line style defines line segment particles with two end points and
+inertia) in order to respond and rotate.  See the :doc:`atom_style
+dipole <atom_style>` command for details.  The "set" command can be used
+to modify the orientation and length of the dipole moment of individual
+particles, after they are created.
+
+The *ellipsoid* style defines particles that are ellipsoids or
+superellipsoids and thus can be aspherical.  Each particle has a shape,
+specified by 3 diameters, and mass (or density).  Superellipsoid
+particles can be defined by additionally specifying 2 blockiness
+exponents (block) and adding the *superellipsoid* keyword to the
+:doc:`atom_style ellipsoid <atom_style>` command.
+
+These particles store an angular momentum and their orientation
+(quaternion), and can be acted upon by torque.  They do not store an
+angular velocity (omega), which can be in a different direction than
+angular momentum, rather they compute it as needed.  The :doc:`set
+command <set>` can be used to modify the diameter, orientation, and mass
+of individual particles, after they are created.  The :doc:`set command
+<set>` can also be used to modify the blockiness of superellipsoid
+particles.  That doc page also has a brief explanation of what quaternions are.
+
+The *line* style defines line segment particles with two end points and
 a mass (or density).  They can be used in 2d simulations, and they can
 be joined together to form rigid bodies which represent arbitrary
 polygons.
 
-The tri style defines triangular particles with three corner points
+The *tri* style defines triangular particles with three corner points
 and a mass (or density).  They can be used in 3d simulations, and they
 can be joined together to form rigid bodies which represent arbitrary
 particles with a triangulated surface.
 
-The peri style is used with :doc:`Peridynamic models <pair_peri>` and
+The *peri* style is used with :doc:`Peridynamic models <pair_peri>` and
 defines particles as having a volume, that is used internally in the
 :doc:`pair_style peri <pair_peri>` potentials.
 
-The body style allows for definition of particles which can represent
-complex entities, such as surface meshes of discrete points,
-collections of sub-particles, deformable objects, etc.  The body style
-is discussed in more detail on the :doc:`Howto body <Howto_body>` doc
-page.
+The *body* style allows for definition of particles which can represent
+complex entities, such as surface meshes of discrete points, collections
+of sub-particles, deformable objects, etc.  The body style is discussed
+in more detail on the :doc:`Howto body <Howto_body>` doc page.
 
 Note that if one of these atom styles is used (or multiple styles via
 the :doc:`atom_style hybrid <atom_style>` command), not all particles in
@@ -88,9 +91,10 @@ the system are required to be finite-size or aspherical.
 For example, in the ellipsoid style, if the 3 shape parameters are set
 to the same value, the particle will be a sphere rather than an
 ellipsoid.  If the 3 shape parameters are all set to 0.0 or if the
-diameter is set to 0.0, it will be a point particle.  In the line or
-tri style, if the lineflag or triflag is specified as 0, then it
-will be a point particle.
+diameter is set to 0.0, it will be a point particle.  In the line or tri
+style, if the lineflag or triflag is specified as 0, then it will be a
+point particle.  Similarly, if a superellipsoid has both blockiness
+parameters set to 1.0, the superellipsoid will be a regular ellipsoid.
 
 Some of the pair styles used to compute pairwise interactions between
 finite-size particles also compute the correct interaction with point
@@ -98,10 +102,10 @@ particles as well, e.g. the interaction between a point particle and a
 finite-size particle or between two point particles.  If necessary,
 :doc:`pair_style hybrid <pair_hybrid>` can be used to ensure the correct
 interactions are computed for the appropriate style of interactions.
-Likewise, using groups to partition particles (ellipsoids versus
-spheres versus point particles) will allow you to use the appropriate
-time integrators and temperature computations for each class of
-particles.  See the doc pages for various commands for details.
+Likewise, using groups to partition particles (ellipsoids versus spheres
+versus point particles) will allow you to use the appropriate time
+integrators and temperature computations for each class of particles.
+See the doc pages for various commands for details.
 
 Also note that for :doc:`2d simulations <dimension>`, atom styles sphere
 and ellipsoid still use 3d particles, rather than as circular disks or
@@ -130,15 +134,15 @@ such interactions.  These are the various :doc:`pair styles <pair_style>` that g
 * :doc:`pair_style tri/lj <pair_tri_lj>`
 * :doc:`pair_style body/nparticle <pair_body_nparticle>`
 
-The granular pair styles are used with spherical particles.  The
-*granular/superellipsoid* granular pair styles are used with superellipsoid particles.
-The dipole pair style is used with the dipole atom style, which could be
-applied to spherical or ellipsoidal particles.  The GayBerne and
-REsquared potentials require ellipsoidal particles, though they will
-also work if the 3 shape parameters are the same (a sphere).  The
-Brownian and lubrication potentials are used with spherical particles.
-The line, tri, and body potentials are used with line segment,
-triangular, and body particles respectively.
+Most of the granular pair styles are used with spherical particles with
+the exception of the *granular/superellipsoid* pair style which is used
+with superellipsoid particles.  The dipole pair style is used with the
+dipole atom style, which could be applied to spherical or ellipsoidal
+particles.  The GayBerne and REsquared potentials require ellipsoidal
+particles, though they will also work if the 3 shape parameters are the
+same (a sphere).  The Brownian and lubrication potentials are used with
+spherical particles.  The line, tri, and body potentials are used with
+line segment, triangular, and body particles respectively.
 
 Time integration
 ----------------
@@ -189,13 +193,13 @@ rotational energy of spherical or ellipsoidal particles:
 * :doc:`compute erotate/asphere <compute_erotate_asphere>`
 
 These include rotational degrees of freedom in their computation.  If
-you wish the thermodynamic output of temperature or pressure to use
-one of these computes (e.g. for a system entirely composed of
-finite-size particles), then the compute can be defined and the
-:doc:`thermo_modify <thermo_modify>` command used.  Note that by default
-thermodynamic quantities will be calculated with a temperature that
-only includes translational degrees of freedom.  See the
-:doc:`thermo_style <thermo_style>` command for details.
+you wish the thermodynamic output of temperature or pressure to use one
+of these computes (e.g. for a system entirely composed of finite-size
+particles), then the compute can be defined and the :doc:`thermo_modify
+<thermo_modify>` command used.  Note that by default thermodynamic
+quantities will be calculated with a temperature that only includes
+translational degrees of freedom.  See the :doc:`thermo_style
+<thermo_style>` command for details.
 
 These commands can be used to output various attributes of finite-size
 particles:
@@ -205,10 +209,10 @@ particles:
 * :doc:`dump local <dump>`
 * :doc:`compute body/local <compute_body_local>`
 
-Attributes include the dipole moment, the angular velocity, the
-angular momentum, the quaternion, the torque, the end-point and
-corner-point coordinates (for line and tri particles), and
-sub-particle attributes of body particles.
+Attributes include the dipole moment, the angular velocity, the angular
+momentum, the quaternion, the torque, the end-point and corner-point
+coordinates (for line and tri particles), and sub-particle attributes of
+body particles.
 
 Rigid bodies composed of finite-size particles
 ----------------------------------------------
@@ -221,23 +225,25 @@ constituent particles, and integrates the motion of the rigid body.
 If any of the constituent particles of a rigid body are finite-size
 particles (spheres or ellipsoids or line segments or triangles), then
 their contribution to the inertia tensor of the body is different than
-if they were point particles.  This means the rotational dynamics of
-the rigid body will be different.  Thus a model of a dimer is
-different if the dimer consists of two point masses versus two
-spheroids, even if the two particles have the same mass.  Finite-size
-particles that experience torque due to their interaction with other
-particles will also impart that torque to a rigid body they are part
-of.
+if they were point particles.  This means the rotational dynamics of the
+rigid body will be different.  Thus a model of a dimer is different if
+the dimer consists of two point masses versus two spheroids, even if the
+two particles have the same mass.  Finite-size particles that experience
+torque due to their interaction with other particles will also impart
+that torque to a rigid body they are part of.
 
-See the "fix rigid" command for example of complex rigid-body models
-it is possible to define in LAMMPS.
+See the :doc:`fix rigid <fix_rigid>` command for examples of complex
+rigid-body models it is possible to define in LAMMPS.
 
 Note that the :doc:`fix shake <fix_shake>` command can also be used to
 treat 2, 3, or 4 particles as a rigid body, but it always assumes the
 particles are point masses.
 
-Also note that body particles cannot be modeled with the :doc:`fix rigid <fix_rigid>` command.  Body particles are treated by LAMMPS
-as single particles, though they can store internal state, such as a
-list of sub-particles.  Individual body particles are typically treated
-as rigid bodies, and their motion integrated with a command like :doc:`fix nve/body <fix_nve_body>`.  Interactions between pairs of body
-particles are computed via a command like :doc:`pair_style body/nparticle <pair_body_nparticle>`.
+Also note that body particles cannot be modeled with the :doc:`fix rigid
+<fix_rigid>` command.  Body particles are treated by LAMMPS as single
+particles, though they can store an internal state, such as a list of
+sub-particles.  Individual body particles are typically treated as rigid
+bodies, and their motion integrated with a command like :doc:`fix
+nve/body <fix_nve_body>`.  Interactions between pairs of body particles
+are computed via a command like :doc:`pair_style body/nparticle
+<pair_body_nparticle>`.
diff --git a/doc/src/Howto_viz.rst b/doc/src/Howto_viz.rst
index 6a64f535121..7a703b5d818 100644
--- a/doc/src/Howto_viz.rst
+++ b/doc/src/Howto_viz.rst
@@ -605,14 +605,15 @@ faces (*bflag1* value 1), or both (*bflag1* value 3).
 
 -------------
 
-Visualizing ellipsoid particles
--------------------------------
+Visualizing ellipsoid and superellipsoid particles
+--------------------------------------------------
 
 .. versionadded:: 11Feb2026
 
 Ellipsoidal particles are a generalization of spheres that may have
-three different radii to define the shape.  They can be modeled using
-pair styles like :doc:`gayberne <pair_gayberne>` or :doc:`resquared
+three different radii to define the shape.  Superellipsoids are in turn
+a generalization of ellipsoids.  Ellipsoids can be modeled using pair styles
+like :doc:`gayberne <pair_gayberne>` or :doc:`resquared
 <pair_resquared>`.  The regular :doc:`dump custom <dump>` command can
 output the center of those bodies, the shape parameters and the
 orientation as quaternions.  If one follows the required conventions and
@@ -622,49 +623,75 @@ follows the documented steps, those trajectory dump files can be
 
 .. versionchanged:: TBD
 
-   Now uses curved triangles instead of flat ones; "both" option is removed
-
-As an alternative, the ellipsoid particles can be visualized directly
-with :doc:`dump image <dump_image>` using the *ellipsoid* keyword.  The
-color and transparency settings can be changed by setting those
-properties for the corresponding atom types.  It is also possible to
-represent the ellipsoids via generating a triangle mesh and visualizing
-it as either wireframes (*eflag* value 2) or rounded faces (*eflag*
-value 1).  The use of a triangle mesh is currently required since the
-rasterizer built into LAMMPS does not offer a suitable graphics
-primitive for ellipsoids.  The mesh is constructed by iteratively
-refining a triangle mesh representing an icosahedron where each triangle
-is replaced by four triangles in each iteration.  For a sufficiently
-smooth representation a refinement level of 4 to 6 is required (see
-example images below).  A high refinement level can cause a significant
-slowdown of the rendering of the image due to the large number of
-triangles that need to be drawn.  This will be more pronounced when
-enabling FSAA or SSAO or both.
-
-.. |ellipsoid1| image:: img/ellipsoid-level2.png
-   :width: 33%
-.. |ellipsoid2| image:: img/ellipsoid-level4.png
-   :width: 33%
-.. |ellipsoid3| image:: img/ellipsoid-level6.png
-   :width: 33%
+   Now uses curved triangles instead of flat ones; "both" option is removed; support for superellipsoids was added
+
+As an alternative, the ellipsoid and superellipsoid particles can be
+visualized directly with :doc:`dump image <dump_image>` using the
+*ellipsoid* keyword.  The color and transparency settings can be changed
+by setting those properties for the corresponding atom types.  It is
+also possible to represent the ellipsoids via generating a triangle mesh
+and visualizing it as either wireframes (*eflag* value 2) or rounded
+triangle faces (*eflag* value 1).  The use of a triangle mesh is
+currently required since the rasterizer built into LAMMPS does not offer
+suitable graphics primitives for ellipsoids or superellipsoids.  The
+mesh is constructed by iteratively refining a triangle mesh representing
+an icosahedron, where each triangle is replaced by four triangles in
+each iteration.  For a smooth representation a refinement level of 4
+seems sufficient, but high resolution images may benefit from a higher
+level (maximum is 6, see example images below).  A high refinement level
+can cause a significant slowdown of the rendering of the image due to
+the large number of triangles that need to be computed and drawn.  This
+slowdown will be more pronounced when enabling FSAA or SSAO or both.
+
+.. |ellipsoid1| image:: img/ellipsoid-mesh.png
+   :width: 24%
+.. |ellipsoid2| image:: img/ellipsoid-level2.png
+   :width: 24%
+.. |ellipsoid3| image:: img/ellipsoid-level4.png
+   :width: 24%
+.. |ellipsoid4| image:: img/ellipsoid-level6.png
+   :width: 24%
 
-|ellipsoid1|  |ellipsoid2|  |ellipsoid3|
+|ellipsoid1|  |ellipsoid2|  |ellipsoid3|  |ellipsoid4|
 
 .. raw:: html
 
-   <center>(Ellipsoid particle visualization examples for different mesh refinement levels.
-   left: level 2, center: level 4, right: level 6. Click to see the full-size images)</center><br>
+   <center>(Ellipsoid particle visualization examples for different mesh
+         levels.  From left to right: wireframe level 3, triangles level
+         2, triangles level 4, triangles level 6. Click to see the
+         full-size images)</center><br>
 
 These images were created by adding the following :doc:`dump image and dump_modify <dump_image>`
 commands to the ``in.ellipse.resquared`` input example:
 
 .. code-block:: LAMMPS
 
-   #                                                       change + this
-   dump viz all image 1000 image-*.png type type ellipsoid type 3 4 0.05 &
-         size 600 600 zoom 2.2 shiny 0.1 fsaa yes view 80 -10 box yes 0.025 &
-         axes no 0.0 0.0 center s 0.5 0.5 0.5 ssao yes 32185474 0.6
-   dump_modify viz pad 9 boxcolor white backcolor gray adiam 1 4 adiam 2 7
+   #                                                   change /V\ this
+   dump viz all image 1000 image-*.png x type ellipsoid atom 1 4 0.2 &
+        size 600 600 zoom 1.331 view 80 20 box yes 0.025 shiny 0.2 fsaa yes
+   dump_modify viz pad 6 boxcolor goldenrod backcolor black backcolor2 white &
+        color map1 0.459 0.055 0.075 color map2 0.000 0.227 0.427 &
+        amap min max cf 0.0 5 min map1 0.1 map1 0.5 white 0.9 map2 max map2
+
+.. versionadded:: TBD
+
+The visualization of superellipsoids works exactly the same way as for
+ellipsoids by creating a triangle mesh of an icosahedron and refining
+and deforming it.  The only difference is internal: the applied
+deformation function and the corresponding computation of the surface
+normals.  LAMMPS will auto-detect which function to use.  Some
+visualizations of the ``in.drop_test``, the ``in.bowling``, and the
+``in.super_table`` examples from the
+``examples/ASPHERE/superellipsoid_gran`` folder are shown below.
+
+.. |superellipsoid1| image:: img/superellipsoids-drop.png
+   :width: 23%
+.. |superellipsoid2| image:: img/superellipsoids-bowl.png
+   :width: 41%
+.. |superellipsoid3| image:: img/superellipsoids-zoo.png
+   :width: 31%
+
+|superellipsoid1|  |superellipsoid2|  |superellipsoid3|
 
 -------------
 
diff --git a/doc/src/dihedral_nharmonic.rst b/doc/src/dihedral_nharmonic.rst
index dffdb8ee96f..9784ddceda0 100644
--- a/doc/src/dihedral_nharmonic.rst
+++ b/doc/src/dihedral_nharmonic.rst
@@ -1,10 +1,11 @@
 .. index:: dihedral_style nharmonic
+.. index:: dihedral_style nharmonic/kk
 .. index:: dihedral_style nharmonic/omp
 
 dihedral_style nharmonic command
 ================================
 
-Accelerator Variants: *nharmonic/omp*
+Accelerator Variants: *nharmonic/kk*, *nharmonic/omp*
 
 Syntax
 """"""
diff --git a/doc/src/fix_nh.rst b/doc/src/fix_nh.rst
index 9c2f8d606d9..bd207b88b43 100644
--- a/doc/src/fix_nh.rst
+++ b/doc/src/fix_nh.rst
@@ -40,7 +40,7 @@ Syntax
 
   .. parsed-literal::
 
-     keyword = *temp* or *iso* or *aniso* or *tri* or *x* or *y* or *z* or *xy* or *yz* or *xz* or *couple* or *tchain* or *pchain* or *mtk* or *tloop* or *ploop* or *nreset* or *drag* or *ptemp* or *dilate* or *scalexy* or *scaleyz* or *scalexz* or *flip* or *fixedpoint* or *update*
+     keyword = *temp* or *iso* or *aniso* or *tri* or *x* or *y* or *z* or *xy* or *yz* or *xz* or *couple* or *tchain* or *pchain* or *mtk* or *tloop* or *ploop* or *nreset* or *drag* or *ptemp* or *dilate* or *scalexy* or *scaleyz* or *scalexz* or *flip* or *isochoric* or *fixedpoint* or *update*
        *temp* values = Tstart Tstop Tdamp
          Tstart,Tstop = external temperature at start/end of run
          Tdamp = temperature damping parameter (time units)
@@ -95,31 +95,29 @@ These commands perform time integration on Nose-Hoover style
 non-Hamiltonian equations of motion which are designed to generate
 positions and velocities sampled from the canonical (nvt),
 isothermal-isobaric (npt), and isenthalpic (nph) ensembles.  This
-updates the position and velocity for atoms in the group each
-timestep.
+updates the position and velocity for atoms in the group each timestep.
 
 The thermostatting and barostatting is achieved by adding some dynamic
-variables which are coupled to the particle velocities
-(thermostatting) and simulation domain dimensions (barostatting).  In
-addition to basic thermostatting and barostatting, these fixes can
-also create a chain of thermostats coupled to the particle thermostat,
-and another chain of thermostats coupled to the barostat
-variables. The barostat can be coupled to the overall box volume, or
-to individual dimensions, including the *xy*, *xz* and *yz* tilt
-dimensions. The external pressure of the barostat can be specified as
-either a scalar pressure (isobaric ensemble) or as components of a
-symmetric stress tensor (constant stress ensemble).  When used
-correctly, the time-averaged temperature and stress tensor of the
-particles will match the target values specified by Tstart/Tstop and
-Pstart/Pstop.
+variables which are coupled to the particle velocities (thermostatting)
+and simulation domain dimensions (barostatting).  In addition to basic
+thermostatting and barostatting, these fixes can also create a chain of
+thermostats coupled to the particle thermostat, and another chain of
+thermostats coupled to the barostat variables.  The barostat can be
+coupled to the overall box volume, or to individual dimensions,
+including the *xy*, *xz* and *yz* tilt dimensions.  The external
+pressure of the barostat can be specified as either a scalar pressure
+(isobaric ensemble) or as components of a symmetric stress tensor
+(constant stress ensemble).  When used correctly, the time-averaged
+temperature and stress tensor of the particles will match the target
+values specified by Tstart/Tstop and Pstart/Pstop.
 
 The equations of motion used are those of Shinoda et al in
-:ref:`(Shinoda) <nh-Shinoda>`, which combine the hydrostatic equations of
-Martyna, Tobias and Klein in :ref:`(Martyna) <nh-Martyna>` with the strain
-energy proposed by Parrinello and Rahman in
-:ref:`(Parrinello) <nh-Parrinello>`.  The time integration schemes closely
-follow the time-reversible measure-preserving Verlet and rRESPA
-integrators derived by Tuckerman et al in :ref:`(Tuckerman) <nh-Tuckerman>`.
+:ref:`(Shinoda) <nh-Shinoda>`, which combine the hydrostatic equations
+of Martyna, Tobias and Klein in :ref:`(Martyna) <nh-Martyna>` with the
+strain energy proposed by Parrinello and Rahman in :ref:`(Parrinello)
+<nh-Parrinello>`.  The time integration schemes closely follow the
+time-reversible measure-preserving Verlet and rRESPA integrators derived
+by Tuckerman et al in :ref:`(Tuckerman) <nh-Tuckerman>`.
 
 ----------
 
@@ -128,27 +126,28 @@ using the *temp* keyword.  Other thermostat-related keywords are
 *tchain*, *tloop* and *drag*, which are discussed below.
 
 The thermostat is applied to only the translational degrees of freedom
-for the particles.  The translational degrees of freedom can also have
-a bias velocity removed before thermostatting takes place; see the
-description below.  The desired temperature at each timestep is a
-ramped value during the run from *Tstart* to *Tstop*\ .  The *Tdamp*
-parameter is specified in time units and determines how rapidly the
-temperature is relaxed.  For example, a value of 10.0 means to relax
-the temperature in a timespan of (roughly) 10 time units (e.g. :math:`\tau`
-or fs or ps - see the :doc:`units <units>` command).  The atoms in the
-fix group are the only ones whose velocities and positions are updated
-by the velocity/position update portion of the integration.
+for the particles.  The translational degrees of freedom can also have a
+bias velocity removed before thermostatting takes place; see the
+description below.  The desired temperature at each timestep is a ramped
+value during the run from *Tstart* to *Tstop*\ .  The *Tdamp* parameter
+is specified in time units and determines how rapidly the temperature is
+relaxed.  For example, a value of 10.0 means to relax the temperature in
+a timespan of (roughly) 10 time units (e.g. :math:`\tau` or fs or ps -
+see the :doc:`units <units>` command).  The atoms in the fix group are
+the only ones whose velocities and positions are updated by the
+velocity/position update portion of the integration.
 
 .. note::
 
-   A Nose-Hoover thermostat will not work well for arbitrary values
-   of *Tdamp*\ .  If *Tdamp* is too small, the temperature can fluctuate
-   wildly; if it is too large, the temperature will take a very long time
-   to equilibrate.  A good choice for many models is a *Tdamp* of around
-   100 timesteps.  Note that this is NOT the same as 100 time units for
-   most :doc:`units <units>` settings. A simple way to ensure this, is
-   via using an :doc:`immediate variable <variable>` expression accessing
-   the thermo property 'dt', which is the length of the time step. Example:
+   A Nose-Hoover thermostat will not work well for arbitrary values of
+   *Tdamp*\ .  If *Tdamp* is too small, the temperature can fluctuate
+   wildly; if it is too large, the temperature will take a very long
+   time to equilibrate.  A good choice for many models is a *Tdamp* of
+   around 100 timesteps.  Note that this is NOT the same as 100 time
+   units for most :doc:`units <units>` settings.  A simple way to ensure
+   this is to use an :doc:`immediate variable <variable>` expression
+   accessing the thermo property 'dt', which is the length of the time
+   step.  Example:
 
 .. code-block:: LAMMPS
 
@@ -158,32 +157,32 @@ by the velocity/position update portion of the integration.
 
 The barostat parameters for fix styles *npt* and *nph* is specified
 using one or more of the *iso*, *aniso*, *tri*, *x*, *y*, *z*, *xy*,
-*xz*, *yz*, and *couple* keywords.  These keywords give you the
-ability to specify all 6 components of an external stress tensor, and
-to couple various of these components together so that the dimensions
-they represent are varied together during a constant-pressure
-simulation.
+*xz*, *yz*, and *couple* keywords.  These keywords give you the ability
+to specify all 6 components of an external stress tensor, and to couple
+various of these components together so that the dimensions they
+represent are varied together during a constant-pressure simulation.
 
-Other barostat-related keywords are *pchain*, *mtk*, *ploop*,
-*nreset*, *drag*, and *dilate*, which are discussed below.
+Other barostat-related keywords are *pchain*, *mtk*, *ploop*, *nreset*,
+*drag*, and *dilate*, which are discussed below.
 
 Orthogonal simulation boxes have 3 adjustable dimensions (x,y,z).
-Triclinic (non-orthogonal) simulation boxes have 6 adjustable
-dimensions (x,y,z,xy,xz,yz).  The :doc:`create_box <create_box>`, :doc:`read data <read_data>`, and :doc:`read_restart <read_restart>` commands
-specify whether the simulation box is orthogonal or non-orthogonal
-(triclinic) and explain the meaning of the xy,xz,yz tilt factors.
+Triclinic (non-orthogonal) simulation boxes have 6 adjustable dimensions
+(x,y,z,xy,xz,yz).  The :doc:`create_box <create_box>`, :doc:`read_data
+<read_data>`, and :doc:`read_restart <read_restart>` commands specify
+whether the simulation box is orthogonal or non-orthogonal (triclinic)
+and explain the meaning of the xy,xz,yz tilt factors.
 
 The target pressures for each of the 6 components of the stress tensor
 can be specified independently via the *x*, *y*, *z*, *xy*, *xz*, *yz*
-keywords, which correspond to the 6 simulation box dimensions.  For
-each component, the external pressure or tensor component at each
-timestep is a ramped value during the run from *Pstart* to *Pstop*\ .
-If a target pressure is specified for a component, then the
-corresponding box dimension will change during a simulation.  For
-example, if the *y* keyword is used, the y-box length will change.  If
-the *xy* keyword is used, the xy tilt factor will change.  A box
-dimension will not change if that component is not specified, although
-you have the option to change that dimension via the :doc:`fix deform <fix_deform>` command.
+keywords, which correspond to the 6 simulation box dimensions.  For each
+component, the external pressure or tensor component at each timestep is
+a ramped value during the run from *Pstart* to *Pstop*\ .  If a target
+pressure is specified for a component, then the corresponding box
+dimension will change during a simulation.  For example, if the *y*
+keyword is used, the y-box length will change.  If the *xy* keyword is
+used, the xy tilt factor will change.  A box dimension will not change
+if that component is not specified, although you have the option to
+change that dimension via the :doc:`fix deform <fix_deform>` command.
 
 Note that in order to use the *xy*, *xz*, or *yz* keywords, the
 simulation box must be triclinic, even if its initial tilt factors are
@@ -191,19 +190,19 @@ simulation box must be triclinic, even if its initial tilt factors are
 
 For all barostat keywords, the *Pdamp* parameter operates like the
 *Tdamp* parameter, determining the time scale on which pressure is
-relaxed.  For example, a value of 10.0 means to relax the pressure in
-a timespan of (roughly) 10 time units (e.g. :math:`\tau` or fs or ps
+relaxed.  For example, a value of 10.0 means to relax the pressure in a
+timespan of (roughly) 10 time units (e.g. :math:`\tau` or fs or ps
 - see the :doc:`units <units>` command).
 
 .. note::
 
-   A Nose-Hoover barostat will not work well for arbitrary values
-   of *Pdamp*\ .  If *Pdamp* is too small, the pressure and volume can
+   A Nose-Hoover barostat will not work well for arbitrary values of
+   *Pdamp*\ .  If *Pdamp* is too small, the pressure and volume can
    fluctuate wildly; if it is too large, the pressure will take a very
    long time to equilibrate.  A good choice for many models is a *Pdamp*
    of around 1000 timesteps.  However, note that *Pdamp* is specified in
-   time units, and that timesteps are NOT the same as time units for most
-   :doc:`units <units>` settings.
+   time units, and that timesteps are NOT the same as time units for
+   most :doc:`units <units>` settings.
 
 The relaxation rate of the barostat is set by its inertia :math:`W`:
 
@@ -211,51 +210,52 @@ The relaxation rate of the barostat is set by its inertia :math:`W`:
 
    W = (N + 1) k_B T_\mathrm{target} P_\mathrm{damp}^2
 
-where :math:`N` is the number of atoms, :math:`k_B` is the Boltzmann constant,
-and :math:`T_\mathrm{target}` is the target temperature of the barostat :ref:`(Martyna) <nh-Martyna>`.
-If a thermostat is defined, :math:`T_\mathrm{target}` is the target temperature
-of the thermostat. If a thermostat is not defined, :math:`T_\mathrm{target}`
-is set to the current temperature of the system when the barostat is initialized.
-If this temperature is too low the simulation will quit with an error.
-Note: in previous versions of LAMMPS, :math:`T_\mathrm{target}` would default to
-a value of 1.0 for *lj* units and 300.0 otherwise if the system had a temperature
-of exactly zero.
-
-If a thermostat is not specified by this fix, :math:`T_\mathrm{target}` can be
-manually specified using the *Ptemp* parameter. This may be useful if the
-barostat is initialized when the current temperature does not reflect the
-steady state temperature of the system. This keyword may also be useful in
-athermal simulations where the temperature is not well defined.
-
-Regardless of what atoms are in the fix group (the only atoms which
-are time integrated), a global pressure or stress tensor is computed
-for all atoms.  Similarly, when the size of the simulation box is
-changed, all atoms are re-scaled to new positions, unless the keyword
-*dilate* is specified with a *dilate-group-ID* for a group that
-represents a subset of the atoms.  This can be useful, for example, to
-leave the coordinates of atoms in a solid substrate unchanged and
-controlling the pressure of a surrounding fluid.  This option should
-be used with care, since it can be unphysical to dilate some atoms and
-not others, because it can introduce large, instantaneous
-displacements between a pair of atoms (one dilated, one not) that are
-far from the dilation origin.  Also note that for atoms not in the fix
-group, a separate time integration fix like :doc:`fix nve <fix_nve>` or
-:doc:`fix nvt <fix_nh>` can be used on them, independent of whether they
-are dilated or not.
+where :math:`N` is the number of atoms, :math:`k_B` is the Boltzmann
+constant, and :math:`T_\mathrm{target}` is the target temperature of the
+barostat :ref:`(Martyna) <nh-Martyna>`.  If a thermostat is defined,
+:math:`T_\mathrm{target}` is the target temperature of the thermostat.
+If a thermostat is not defined, :math:`T_\mathrm{target}` is set to the
+current temperature of the system when the barostat is initialized.  If
+this temperature is too low the simulation will quit with an error.
+Note: in previous versions of LAMMPS, :math:`T_\mathrm{target}` would
+default to a value of 1.0 for *lj* units and 300.0 otherwise if the
+system had a temperature of exactly zero.
+
+If a thermostat is not specified by this fix, :math:`T_\mathrm{target}`
+can be manually specified using the *Ptemp* parameter.  This may be
+useful if the barostat is initialized when the current temperature does
+not reflect the steady state temperature of the system.  This keyword
+may also be useful in athermal simulations where the temperature is not
+well defined.
+
+Regardless of what atoms are in the fix group (the only atoms which are
+time integrated), a global pressure or stress tensor is computed for all
+atoms.  Similarly, when the size of the simulation box is changed, all
+atoms are re-scaled to new positions, unless the keyword *dilate* is
+specified with a *dilate-group-ID* for a group that represents a subset
+of the atoms.  This can be useful, for example, to leave the coordinates
+of atoms in a solid substrate unchanged while controlling the pressure of
+a surrounding fluid.  This option should be used with care, since it can
+be unphysical to dilate some atoms and not others, because it can
+introduce large, instantaneous displacements between a pair of atoms
+(one dilated, one not) that are far from the dilation origin.  Also note
+that for atoms not in the fix group, a separate time integration fix
+like :doc:`fix nve <fix_nve>` or :doc:`fix nvt <fix_nh>` can be used on
+them, independent of whether they are dilated or not.
 
 ----------
 
 The *couple* keyword allows two or three of the diagonal components of
-the pressure tensor to be "coupled" together.  The value specified
-with the keyword determines which are coupled.  For example, *xz*
-means the *Pxx* and *Pzz* components of the stress tensor are coupled.
-*Xyz* means all 3 diagonal components are coupled.  Coupling means two
-things: the instantaneous stress will be computed as an average of the
+the pressure tensor to be "coupled" together.  The value specified with
+the keyword determines which are coupled.  For example, *xz* means the
+*Pxx* and *Pzz* components of the stress tensor are coupled.  *Xyz*
+means all 3 diagonal components are coupled.  Coupling means two things:
+the instantaneous stress will be computed as an average of the
 corresponding diagonal components, and the coupled box dimensions will
 be changed together in lockstep, meaning coupled dimensions will be
 dilated or contracted by the same percentage every timestep.  The
-*Pstart*, *Pstop*, *Pdamp* parameters for any coupled dimensions must
-be identical.  *Couple xyz* can be used for a 2d simulation; the *z*
+*Pstart*, *Pstop*, *Pdamp* parameters for any coupled dimensions must be
+identical.  *Couple xyz* can be used for a 2d simulation; the *z*
 dimension is simply ignored.
 
 ----------
@@ -276,8 +276,8 @@ specifying these 4 keywords:
    couple xyz
 
 The keyword *aniso* means *x*, *y*, and *z* dimensions are controlled
-independently using the *Pxx*, *Pyy*, and *Pzz* components of the
-stress tensor as the driving forces, and the specified scalar external
+independently using the *Pxx*, *Pyy*, and *Pzz* components of the stress
+tensor as the driving forces, and the specified scalar external
 pressure.  Using "aniso Pstart Pstop Pdamp" is the same as specifying
 these 4 keywords:
 
@@ -289,10 +289,10 @@ these 4 keywords:
    couple none
 
 The keyword *tri* means *x*, *y*, *z*, *xy*, *xz*, and *yz* dimensions
-are controlled independently using their individual stress components
-as the driving forces, and the specified scalar pressure as the
-external normal stress.  Using "tri Pstart Pstop Pdamp" is the same as
-specifying these 7 keywords:
+are controlled independently using their individual stress components as
+the driving forces, and the specified scalar pressure as the external
+normal stress.  Using "tri Pstart Pstop Pdamp" is the same as specifying
+these 7 keywords:
 
 .. parsed-literal::
 
@@ -306,116 +306,117 @@ specifying these 7 keywords:
 
 ----------
 
-In some cases (e.g. for solids) the pressure (volume) and/or
-temperature of the system can oscillate undesirably when a Nose/Hoover
-barostat and thermostat is applied.  The optional *drag* keyword will
-damp these oscillations, although it alters the Nose/Hoover equations.
-A value of 0.0 (no drag) leaves the Nose/Hoover formalism unchanged.
-A non-zero value adds a drag term; the larger the value specified, the
-greater the damping effect.  Performing a short run and monitoring the
-pressure and temperature is the best way to determine if the drag term
-is working.  Typically a value between 0.2 to 2.0 is sufficient to
-damp oscillations after a few periods. Note that use of the drag
-keyword will interfere with energy conservation and will also change
-the distribution of positions and velocities so that they do not
-correspond to the nominal NVT, NPT, or NPH ensembles.
+In some cases (e.g. for solids) the pressure (volume) and/or temperature
+of the system can oscillate undesirably when a Nose/Hoover barostat and
+thermostat is applied.  The optional *drag* keyword will damp these
+oscillations, although it alters the Nose/Hoover equations.  A value of
+0.0 (no drag) leaves the Nose/Hoover formalism unchanged.  A non-zero
+value adds a drag term; the larger the value specified, the greater the
+damping effect.  Performing a short run and monitoring the pressure and
+temperature is the best way to determine if the drag term is working.
+Typically a value between 0.2 and 2.0 is sufficient to damp oscillations
+after a few periods.  Note that use of the drag keyword will interfere
+with energy conservation and will also change the distribution of
+positions and velocities so that they do not correspond to the nominal
+NVT, NPT, or NPH ensembles.
 
 An alternative way to control initial oscillations is to use chain
-thermostats. The keyword *tchain* determines the number of thermostats
-in the particle thermostat. A value of 1 corresponds to the original
-Nose-Hoover thermostat. The keyword *pchain* specifies the number of
-thermostats in the chain thermostatting the barostat degrees of
-freedom. A value of 0 corresponds to no thermostatting of the
-barostat variables.
+thermostats.  The keyword *tchain* determines the number of thermostats
+in the particle thermostat.  A value of 1 corresponds to the original
+Nose-Hoover thermostat.  The keyword *pchain* specifies the number of
+thermostats in the chain thermostatting the barostat degrees of freedom.
+A value of 0 corresponds to no thermostatting of the barostat variables.
 
 The *mtk* keyword controls whether or not the correction terms due to
 Martyna, Tuckerman, and Klein are included in the equations of motion
 :ref:`(Martyna) <nh-Martyna>`.  Specifying *no* reproduces the original
-Hoover barostat, whose volume probability distribution function
-differs from the true NPT and NPH ensembles by a factor of 1/V.  Hence
-using *yes* is more correct, but in many cases the difference is
-negligible.
+Hoover barostat, whose volume probability distribution function differs
+from the true NPT and NPH ensembles by a factor of 1/V.  Hence using
+*yes* is more correct, but in many cases the difference is negligible.
 
 The keyword *tloop* can be used to improve the accuracy of integration
 scheme at little extra cost.  The initial and final updates of the
 thermostat variables are broken up into *tloop* sub-steps, each of
-length *dt*\ /\ *tloop*\ . This corresponds to using a first-order
-Suzuki-Yoshida scheme :ref:`(Tuckerman) <nh-Tuckerman>`.  The keyword *ploop*
-does the same thing for the barostat thermostat.
-
-The keyword *nreset* controls how often the reference dimensions used
-to define the strain energy are reset.  If this keyword is not used,
-or is given a value of zero, then the reference dimensions are set to
-those of the initial simulation domain and are never changed. If the
-simulation domain changes significantly during the simulation, then
-the final average pressure tensor will differ significantly from the
-specified values of the external stress tensor.  A value of *nstep*
-means that every *nstep* timesteps, the reference dimensions are set
-to those of the current simulation domain.
-
-The *scaleyz*, *scalexz*, and *scalexy* keywords control whether or
-not the corresponding tilt factors are scaled with the associated box
+length *dt*\ /\ *tloop*\ .  This corresponds to using a first-order
+Suzuki-Yoshida scheme :ref:`(Tuckerman) <nh-Tuckerman>`.  The keyword
+*ploop* does the same thing for the barostat thermostat.
+
+The keyword *nreset* controls how often the reference dimensions used to
+define the strain energy are reset.  If this keyword is not used, or is
+given a value of zero, then the reference dimensions are set to those of
+the initial simulation domain and are never changed.  If the simulation
+domain changes significantly during the simulation, then the final
+average pressure tensor will differ significantly from the specified
+values of the external stress tensor.  A value of *nstep* means that
+every *nstep* timesteps, the reference dimensions are set to those of
+the current simulation domain.
+
+The *scaleyz*, *scalexz*, and *scalexy* keywords control whether or not
+the corresponding tilt factors are scaled with the associated box
 dimensions when barostatting triclinic periodic cells.  The default
 values *yes* will turn on scaling, which corresponds to adjusting the
-linear dimensions of the cell while preserving its shape.  Choosing
-*no* ensures that the tilt factors are not scaled with the box
-dimensions. See below for restrictions and default values in different
-situations. In older versions of LAMMPS, scaling of tilt factors was
-not performed. The old behavior can be recovered by setting all three
-scale keywords to *no*\ .
-
-The *flip* keyword allows the tilt factors for a triclinic box to
-exceed half the distance of the parallel box length, as discussed
-below.  If the *flip* value is set to *yes*, the bound is enforced by
-flipping the box when it is exceeded.  If the *flip* value is set to
-*no*, the tilt will continue to change without flipping.  Note that if
-applied stress induces large deformations (e.g. in a liquid), this
-means the box shape can tilt dramatically and LAMMPS will run less
-efficiently, due to the large volume of communication needed to
-acquire ghost atoms around a processor's irregular-shaped subdomain.
-For extreme values of tilt, LAMMPS may also lose atoms and generate an
-error.
-
-The *isochoric* keyword allows to maintain constant volume when barostating
-up to two dimensions with this fix. The values following the isochoric keyword indicates the
-dimensions to use in that regard: "x" indicates the x dimension, "yz" (no
-space) indicates the y and z dimensions, etc. The selected dimensions are scaled to
-compensate the strain induced by the barostat and keep the system at a constant volume
-(or area in 2d). It is not possible to use this keyword if all the
-dimensions are coupled to barostats. In the case of 2d simulations, only x and
-y dimensions can be used to maintain a constant plane area. If you want to perform
-strain with constant volume, the :doc:`fix deform <fix_deform>` command using
-*volume* keyword is more likely to suit your needs.
+linear dimensions of the cell while preserving its shape.  Choosing *no*
+ensures that the tilt factors are not scaled with the box dimensions.
+See below for restrictions and default values in different situations.
+In older versions of LAMMPS, scaling of tilt factors was not performed.
+The old behavior can be recovered by setting all three scale keywords to
+*no*\ .
+
+The *flip* keyword allows the tilt factors for a triclinic box to exceed
+half the distance of the parallel box length, as discussed below.  If
+the *flip* value is set to *yes*, the bound is enforced by flipping the
+box when it is exceeded.  If the *flip* value is set to *no*, the tilt
+will continue to change without flipping.  Note that if applied stress
+induces large deformations (e.g. in a liquid), this means the box shape
+can tilt dramatically and LAMMPS will run less efficiently, due to the
+large volume of communication needed to acquire ghost atoms around a
+processor's irregular-shaped subdomain.  For extreme values of tilt,
+LAMMPS may also lose atoms and generate an error.
+
+.. versionadded:: TBD
+
+The *isochoric* keyword makes it possible to maintain constant volume
+when barostatting up to two dimensions with this fix.  The value
+following the *isochoric* keyword indicates which dimensions are used
+for that purpose: "x" indicates the x dimension, "yz" (no space)
+indicates the y and z dimensions, etc.  The selected dimensions are
+scaled to compensate for the strain induced by the barostat and keep the
+system at a constant volume (or area in 2d).  It is not possible to use
+this keyword if *all* dimensions are coupled to a barostat.  In the case
+of 2d simulations, only the x and y dimensions can be used to maintain a
+constant plane area.  If you want to apply a strain at constant volume,
+the :doc:`fix deform <fix_deform>` command with its *volume* keyword is
+more likely to suit your needs.
 
 .. note::
-   If large strains are caused by the barostat because the initial configuration
-   is far from pressure equilibrium or equilibrated too fast, the system will
-   see large strains on the other dimensions as well. It is recommended to
-   perform preliminary NPT equilibration if necessary using standard NPT
-   simulations.
+
+   If large strains are caused by the barostat because the initial
+   configuration is far from pressure equilibrium or equilibrated too
+   fast, the system will see large strains on the other dimensions as
+   well.  It is recommended to perform preliminary NPT equilibration if
+   necessary using standard NPT simulations.
 
 The *fixedpoint* keyword specifies the fixed point for barostat volume
-changes. By default, it is the center of the box.  Whatever point is
+changes.  By default, it is the center of the box.  Whatever point is
 chosen will not move during the simulation.  For example, if the lower
-periodic boundaries pass through (0,0,0), and this point is provided
-to *fixedpoint*, then the lower periodic boundaries will remain at
-(0,0,0), while the upper periodic boundaries will move twice as
-far. In all cases, the particle trajectories are unaffected by the
-chosen value, except for a time-dependent constant translation of
-positions.
+periodic boundaries pass through (0,0,0), and this point is provided to
+*fixedpoint*, then the lower periodic boundaries will remain at (0,0,0),
+while the upper periodic boundaries will move twice as far.  In all
+cases, the particle trajectories are unaffected by the chosen value,
+except for a time-dependent constant translation of positions.
 
 If the *update* keyword is used with the *dipole* value, then the
-orientation of the dipole moment of each particle is also updated
-during the time integration.  This option should be used for models
-where a dipole moment is assigned to finite-size particles,
-e.g. spheroids via use of the :doc:`atom_style hybrid sphere dipole <atom_style>` command.
+orientation of the dipole moment of each particle is also updated during
+the time integration.  This option should be used for models where a
+dipole moment is assigned to finite-size particles, e.g. spheroids via
+use of the :doc:`atom_style hybrid sphere dipole <atom_style>` command.
 
 The default dipole orientation integrator can be changed to the
-Dullweber-Leimkuhler-McLachlan integration scheme
-:ref:`(Dullweber) <nh-Dullweber>` when using *update* with the value
-*dipole/dlm*\ . This integrator is symplectic and time-reversible,
-giving better energy conservation and allows slightly longer timesteps
-at only a small additional computational cost.
+Dullweber-Leimkuhler-McLachlan integration scheme :ref:`(Dullweber)
+<nh-Dullweber>` when using *update* with the value *dipole/dlm*\ .  This
+integrator is symplectic and time-reversible, giving better energy
+conservation and allowing slightly longer timesteps at only a small
+additional computational cost.
 
 ----------
 
@@ -424,47 +425,47 @@ at only a small additional computational cost.
    Using a barostat coupled to tilt dimensions *xy*, *xz*, *yz* can
    sometimes result in arbitrarily large values of the tilt dimensions,
    i.e. a dramatically deformed simulation box.  LAMMPS allows the tilt
-   factors to grow a small amount beyond the normal limit of half the box
-   length (0.6 times the box length), and then performs a box "flip" to
-   an equivalent periodic cell.  See the discussion of the *flip* keyword
-   above, to allow this bound to be exceeded, if desired.
+   factors to grow a small amount beyond the normal limit of half the
+   box length (0.6 times the box length), and then performs a box "flip"
+   to an equivalent periodic cell.  See the discussion of the *flip*
+   keyword above, to allow this bound to be exceeded, if desired.
 
-The flip operation is described in more detail in the page for
-:doc:`fix deform <fix_deform>`.  Both the barostat dynamics and the atom
+The flip operation is described in more detail in the page for :doc:`fix
+deform <fix_deform>`.  Both the barostat dynamics and the atom
 trajectories are unaffected by this operation.  However, if a tilt
-factor is incremented by a large amount (1.5 times the box length) on
-a single timestep, LAMMPS can not accommodate this event and will
-terminate the simulation with an error. This error typically indicates
+factor is incremented by a large amount (1.5 times the box length) on a
+single timestep, LAMMPS can not accommodate this event and will
+terminate the simulation with an error.  This error typically indicates
 that there is something badly wrong with how the simulation was
-constructed, such as specifying values of *Pstart* that are too far
-from the current stress value, or specifying a timestep that is too
-large. Triclinic barostatting should be used with care. This also is
-true for other barostat styles, although they tend to be more
-forgiving of insults. In particular, it is important to recognize that
-equilibrium liquids can not support a shear stress and that
-equilibrium solids can not support shear stresses that exceed the
-yield stress.
+constructed, such as specifying values of *Pstart* that are too far from
+the current stress value, or specifying a timestep that is too large.
+Triclinic barostatting should be used with care.  This also is true for
+other barostat styles, although they tend to be more forgiving of
+insults.  In particular, it is important to recognize that equilibrium
+liquids can not support a shear stress and that equilibrium solids can
+not support shear stresses that exceed the yield stress.
 
 One exception to this rule is if the first dimension in the tilt factor
-(x for xy) is non-periodic.  In that case, the limits on the tilt
-factor are not enforced, since flipping the box in that dimension does
-not change the atom positions due to non-periodicity.  In this mode,
-if you tilt the system to extreme angles, the simulation will simply
-become inefficient due to the highly skewed simulation box.
+(x for xy) is non-periodic.  In that case, the limits on the tilt factor
+are not enforced, since flipping the box in that dimension does not
+change the atom positions due to non-periodicity.  In this mode, if you
+tilt the system to extreme angles, the simulation will simply become
+inefficient due to the highly skewed simulation box.
 
 .. note::
 
    Unlike the :doc:`fix temp/berendsen <fix_temp_berendsen>` command
    which performs thermostatting but NO time integration, these fixes
    perform thermostatting/barostatting AND time integration.  Thus you
-   should not use any other time integration fix, such as :doc:`fix nve <fix_nve>` on atoms to which this fix is applied.  Likewise,
-   fix nvt and fix npt should not normally be used on atoms that also
-   have their temperature controlled by another fix - e.g. by :doc:`fix langevin <fix_nh>` or :doc:`fix temp/rescale <fix_temp_rescale>`
-   commands.
+   should not use any other time integration fix, such as :doc:`fix nve
+   <fix_nve>` on atoms to which this fix is applied.  Likewise, fix nvt
+   and fix npt should not normally be used on atoms that also have their
+   temperature controlled by another fix - e.g. by :doc:`fix langevin
+   <fix_langevin>` or :doc:`fix temp/rescale <fix_temp_rescale>` commands.
 
-See the :doc:`Howto thermostat <Howto_thermostat>` and :doc:`Howto barostat <Howto_barostat>` doc pages for a discussion of different
-ways to compute temperature and perform thermostatting and
-barostatting.
+See the :doc:`Howto thermostat <Howto_thermostat>` and :doc:`Howto
+barostat <Howto_barostat>` doc pages for a discussion of different ways
+to compute temperature and perform thermostatting and barostatting.
 
 ----------
 
@@ -488,45 +489,46 @@ For fix npt and fix nph:
 
 For fix nvt, the group for the new temperature compute is the same as
 the fix group.  For fix npt and fix nph, the group for both the new
-temperature and pressure compute is "all" since pressure is computed
-for the entire system.  In the case of fix nph, the temperature
-compute is not used for thermostatting, but just for a kinetic-energy
-contribution to the pressure.  See the :doc:`compute temp <compute_temp>` and :doc:`compute pressure <compute_pressure>`
-commands for details.  Note that the IDs of the new computes are the
-fix-ID + underscore + "temp" or fix_ID + underscore + "press".
+temperature and pressure compute is "all" since pressure is computed for
+the entire system.  In the case of fix nph, the temperature compute is
+not used for thermostatting, but just for a kinetic-energy contribution
+to the pressure.  See the :doc:`compute temp <compute_temp>` and
+:doc:`compute pressure <compute_pressure>` commands for details.  Note
+that the IDs of the new computes are the fix-ID + underscore + "temp" or
+fix-ID + underscore + "press".
 
 Note that these are NOT the computes used by thermodynamic output (see
 the :doc:`thermo_style <thermo_style>` command) with ID = *thermo_temp*
 and *thermo_press*.  This means you can change the attributes of these
-fix's temperature or pressure via the
-:doc:`compute_modify <compute_modify>` command.  Or you can print this
-temperature or pressure during thermodynamic output via the
-:doc:`thermo_style custom <thermo_style>` command using the appropriate
-compute-ID.  It also means that changing attributes of *thermo_temp*
-or *thermo_press* will have no effect on this fix.
-
-Like other fixes that perform thermostatting, this fix can be used
-with :doc:`compute commands <compute>` that remove a "bias" from the
-atom velocities.  E.g. to apply the thermostat only to atoms within a
-spatial :doc:`region <region>`, or to remove the center-of-mass
-velocity from a group of atoms, or to remove the x-component of
-velocity from the calculation.
+fix's temperature or pressure via the :doc:`compute_modify
+<compute_modify>` command.  Or you can print this temperature or
+pressure during thermodynamic output via the :doc:`thermo_style custom
+<thermo_style>` command using the appropriate compute-ID.  It also means
+that changing attributes of *thermo_temp* or *thermo_press* will have no
+effect on this fix.
+
+Like other fixes that perform thermostatting, this fix can be used with
+:doc:`compute commands <compute>` that remove a "bias" from the atom
+velocities.  E.g. to apply the thermostat only to atoms within a spatial
+:doc:`region <region>`, or to remove the center-of-mass velocity from a
+group of atoms, or to remove the x-component of velocity from the
+calculation.
 
 This is not done by default, but only if the :doc:`fix_modify
 <fix_modify>` command is used to assign a temperature compute to this
 fix that includes such a bias term.  See the doc pages for individual
-:doc:`compute temp commands <compute>` to determine which ones include
-a bias.  In this case, the thermostat works in the following manner:
-bias is removed from each atom, thermostatting is performed on the
-remaining thermal degrees of freedom, and the bias is added back in.
+:doc:`compute temp commands <compute>` to determine which ones include a
+bias.  In this case, the thermostat works in the following manner: bias
+is removed from each atom, thermostatting is performed on the remaining
+thermal degrees of freedom, and the bias is added back in.
 
 ----------
 
 These fixes can be used with either the *verlet* or *respa*
-:doc:`integrators <run_style>`. When using one of the barostat fixes
-with *respa*, LAMMPS uses an integrator constructed
-according to the following factorization of the Liouville propagator
-(for two rRESPA levels):
+:doc:`integrators <run_style>`.  When using one of the barostat fixes
+with *respa*, LAMMPS uses an integrator constructed according to the
+following factorization of the Liouville propagator (for two rRESPA
+levels):
 
 .. math::
 
@@ -550,35 +552,36 @@ according to the following factorization of the Liouville propagator
    &+ \mathcal{O} \left(\Delta t^3 \right)
 
 This factorization differs somewhat from that of Tuckerman et al, in
-that the barostat is only updated at the outermost rRESPA level,
-whereas Tuckerman's factorization requires splitting the pressure into
-pieces corresponding to the forces computed at each rRESPA level. In
-theory, the latter method will exhibit better numerical stability. In
-practice, because Pdamp is normally chosen to be a large multiple of
-the outermost rRESPA timestep, the barostat dynamics are not the
-limiting factor for numerical stability. Both factorizations are
-time-reversible and can be shown to preserve the phase space measure
-of the underlying non-Hamiltonian equations of motion.
+that the barostat is only updated at the outermost rRESPA level, whereas
+Tuckerman's factorization requires splitting the pressure into pieces
+corresponding to the forces computed at each rRESPA level.  In theory,
+the latter method will exhibit better numerical stability.  In practice,
+because Pdamp is normally chosen to be a large multiple of the outermost
+rRESPA timestep, the barostat dynamics are not the limiting factor for
+numerical stability.  Both factorizations are time-reversible and can be
+shown to preserve the phase space measure of the underlying
+non-Hamiltonian equations of motion.
 
 .. note::
 
-   This implementation has been shown to conserve linear momentum
-   up to machine precision under NVT dynamics. Under NPT dynamics,
-   for a system with zero initial total linear momentum, the total
-   momentum fluctuates close to zero. It may occasionally undergo brief
+   This implementation has been shown to conserve linear momentum up to
+   machine precision under NVT dynamics.  Under NPT dynamics, for a
+   system with zero initial total linear momentum, the total momentum
+   fluctuates close to zero.  It may occasionally undergo brief
    excursions to non-negligible values, before returning close to zero.
-   Over long simulations, this has the effect of causing the center-of-mass
-   to undergo a slow random walk. This can be mitigated by resetting
-   the momentum at infrequent intervals using the
-   :doc:`fix momentum <fix_momentum>` command.
+   Over long simulations, this has the effect of causing the
+   center-of-mass to undergo a slow random walk.  This can be mitigated
+   by resetting the momentum at infrequent intervals using the :doc:`fix
+   momentum <fix_momentum>` command.
 
 ----------
 
 The fix npt and fix nph commands can be used with rigid bodies or
 mixtures of rigid bodies and non-rigid particles (e.g. solvent).  But
-there are also :doc:`fix rigid/npt <fix_rigid>` and :doc:`fix rigid/nph <fix_rigid>` commands, which are typically a more natural
-choice.  See the page for those commands for more discussion of
-the various ways to do this.
+there are also :doc:`fix rigid/npt <fix_rigid>` and :doc:`fix rigid/nph
+<fix_rigid>` commands, which are typically a more natural choice.  See
+the page for those commands for more discussion of the various ways to
+do this.
 
 ----------
 
@@ -596,25 +599,26 @@ a fix in an input script that reads a restart file, so that the
 operation of the fix continues in an uninterrupted fashion.
 
 The :doc:`fix_modify <fix_modify>` *temp* and *press* options are
-supported by these fixes.  You can use them to assign a
-:doc:`compute <compute>` you have defined to this fix which will be used
-in its thermostatting or barostatting procedure, as described above.
-If you do this, note that the kinetic energy derived from the compute
-temperature should be consistent with the virial term computed using
-all atoms for the pressure.  LAMMPS will warn you if you choose to
-compute temperature on a subset of atoms.
+supported by these fixes.  You can use them to assign a :doc:`compute
+<compute>` you have defined to this fix which will be used in its
+thermostatting or barostatting procedure, as described above.  If you do
+this, note that the kinetic energy derived from the compute temperature
+should be consistent with the virial term computed using all atoms for
+the pressure.  LAMMPS will warn you if you choose to compute temperature
+on a subset of atoms.
 
 .. note::
 
    If both the *temp* and *press* keywords are used in a single
-   thermo_modify command (or in two separate commands), then the order in
-   which the keywords are specified is important.  Note that a :doc:`pressure compute <compute_pressure>` defines its own temperature compute as
-   an argument when it is specified.  The *temp* keyword will override
-   this (for the pressure compute being used by fix npt), but only if the
-   *temp* keyword comes after the *press* keyword.  If the *temp* keyword
-   comes before the *press* keyword, then the new pressure compute
-   specified by the *press* keyword will be unaffected by the *temp*
-   setting.
+   fix_modify command (or in two separate commands), then the order
+   in which the keywords are specified is important.  Note that a
+   :doc:`pressure compute <compute_pressure>` defines its own
+   temperature compute as an argument when it is specified.  The *temp*
+   keyword will override this (for the pressure compute being used by
+   fix npt), but only if the *temp* keyword comes after the *press*
+   keyword.  If the *temp* keyword comes before the *press* keyword,
+   then the new pressure compute specified by the *press* keyword will
+   be unaffected by the *temp* setting.
 
 The cumulative energy change in the system imposed by these fixes, via
 either thermostatting and/or barostatting, is included in the
@@ -632,16 +636,16 @@ can be accessed by various :doc:`output commands <Howto_output>`.  The
 vector values are "intensive".
 
 The vector stores internal Nose/Hoover thermostat and barostat
-variables.  The number and meaning of the vector values depends on
-which fix is used and the settings for keywords *tchain* and *pchain*,
-which specify the number of Nose/Hoover chains for the thermostat and
+variables.  The number and meaning of the vector values depend on which
+fix is used and the settings for keywords *tchain* and *pchain*, which
+specify the number of Nose/Hoover chains for the thermostat and
 barostat.  If no thermostatting is done, then *tchain* is 0.  If no
-barostatting is done, then *pchain* is 0.  In the following list,
-"ndof" is 0, 1, 3, or 6, and is the number of degrees of freedom in
-the barostat.  Its value is 0 if no barostat is used, else its value
-is 6 if any off-diagonal stress tensor component is barostatted, else
-its value is 1 if *couple xyz* is used or *couple xy* for a 2d
-simulation, otherwise its value is 3.
+barostatting is done, then *pchain* is 0.  In the following list, "ndof"
+is 0, 1, 3, or 6, and is the number of degrees of freedom in the
+barostat.  Its value is 0 if no barostat is used, else its value is 6 if
+any off-diagonal stress tensor component is barostatted, else its value
+is 1 if *couple xyz* is used or *couple xy* for a 2d simulation,
+otherwise its value is 3.
 
 The order of values in the global vector and their meaning is as
 follows.  The notation means there are tchain values for eta, followed
@@ -664,20 +668,22 @@ by tchain for eta_dot, followed by ndof for omega, etc:
 .. versionadded:: 10Dec2025
 
 This fix supports automatically generated thermo column names when using
-:doc:`thermo_modify colname auto <thermo_modify>`.  The thermo column names
-are "f\_", followed by the fix ID, followed by a colon, followed by a
-keyword listed above, followed by an index for that keyword.  Indices range
-from 1 to the number of values for that keyword.  E.g., the first example
-in the Examples section above would print a thermo column name of
-"f\_1:eta[1]", compared to the default column output name of "f\_1[1]".
-Similarly, "f\_1:eta_dot[1]" would be printed instead of the default "f\_1[4].
+:doc:`thermo_modify colname auto <thermo_modify>`.  The thermo column
+names are "f\_", followed by the fix ID, followed by a colon, followed
+by a keyword listed above, followed by an index for that keyword.
+Indices range from 1 to the number of values for that keyword.  E.g.,
+the first example in the Examples section above would print a thermo
+column name of "f\_1:eta[1]", compared to the default column output name
+of "f\_1[1]".  Similarly, "f\_1:eta_dot[1]" would be printed instead of
+the default "f\_1[4]".
 
 These fixes can ramp their external temperature and pressure over
-multiple runs, using the *start* and *stop* keywords of the
-:doc:`run <run>` command.  See the :doc:`run <run>` command for details of
-how to do this.
+multiple runs, using the *start* and *stop* keywords of the :doc:`run
+<run>` command.  See the :doc:`run <run>` command for details of how to
+do this.
 
-These fixes are not invoked during :doc:`energy minimization <minimize>`.
+These fixes are not invoked during :doc:`energy minimization
+<minimize>`.
 
 ----------
 
@@ -691,26 +697,26 @@ simulation domain is triclinic and the second dimension in the keyword
 barostatted for 2D simulations.  The :doc:`create_box <create_box>`,
 :doc:`read data <read_data>`, and :doc:`read_restart <read_restart>`
 commands specify whether the simulation box is orthogonal or
-non-orthogonal (triclinic) and explain the meaning of the xy,xz,yz
-tilt factors.
+non-orthogonal (triclinic) and explain the meaning of the xy,xz,yz tilt
+factors.
 
 For the *temp* keyword, the final Tstop cannot be 0.0 since it would
 make the external T = 0.0 at some timestep during the simulation which
 is not allowed in the Nose/Hoover formulation.
 
 The *scaleyz yes* and *scalexz yes* keyword/value pairs can not be used
-for 2D simulations. *scaleyz yes*, *scalexz yes*, and *scalexy yes* options
-can only be used if the second dimension in the keyword is periodic,
-and if the tilt factor is not coupled to the barostat via keywords
-*tri*, *yz*, *xz*, and *xy*\ .
+for 2D simulations.  The *scaleyz yes*, *scalexz yes*, and *scalexy yes*
+options can only be used if the second dimension in the keyword is
+periodic, and if the tilt factor is not coupled to the barostat via
+keywords *tri*, *yz*, *xz*, and *xy*\ .
 
 These fixes can be used with dynamic groups as defined by the
 :doc:`group <group>` command.  Likewise they can be used with groups to
 which atoms are added or deleted over time, e.g. a deposition
-simulation.  However, the conservation properties of the thermostat
-and barostat are defined for systems with a static set of atoms.  You
-may observe odd behavior if the atoms in a group vary dramatically
-over time or the atom count becomes very small.
+simulation.  However, the conservation properties of the thermostat and
+barostat are defined for systems with a static set of atoms.  You may
+observe odd behavior if the atoms in a group vary dramatically over time
+or the atom count becomes very small.
 
 Related commands
 """"""""""""""""
@@ -721,10 +727,10 @@ Related commands
 Default
 """""""
 
-The keyword defaults are tchain = 3, pchain = 3, mtk = yes, tloop = 1, ploop =
-1, nreset = 0, drag = 0.0, dilate = all, couple = none, flip = yes, scaleyz =
-scalexz = scalexy = yes if periodic in second dimension and not coupled to
-barostat, otherwise no.
+The keyword defaults are tchain = 3, pchain = 3, mtk = yes, tloop = 1,
+ploop = 1, nreset = 0, drag = 0.0, dilate = all, couple = none, flip =
+yes, scaleyz = scalexz = scalexy = yes if periodic in second dimension
+and not coupled to barostat, otherwise no.
 
 ----------
 
diff --git a/doc/src/img/ellipsoid-level2.png b/doc/src/img/ellipsoid-level2.png
index ffddec7b489..92717de6fbc 100644
Binary files a/doc/src/img/ellipsoid-level2.png and b/doc/src/img/ellipsoid-level2.png differ
diff --git a/doc/src/img/ellipsoid-level4.png b/doc/src/img/ellipsoid-level4.png
index 860aedf2d08..bf7f7847423 100644
Binary files a/doc/src/img/ellipsoid-level4.png and b/doc/src/img/ellipsoid-level4.png differ
diff --git a/doc/src/img/ellipsoid-level6.png b/doc/src/img/ellipsoid-level6.png
index 6f0da143745..96304f4c2e8 100644
Binary files a/doc/src/img/ellipsoid-level6.png and b/doc/src/img/ellipsoid-level6.png differ
diff --git a/doc/src/img/ellipsoid-mesh.png b/doc/src/img/ellipsoid-mesh.png
new file mode 100644
index 00000000000..af91b05d754
Binary files /dev/null and b/doc/src/img/ellipsoid-mesh.png differ
diff --git a/doc/src/img/superellipsoids-bowl.png b/doc/src/img/superellipsoids-bowl.png
new file mode 100644
index 00000000000..510200218c3
Binary files /dev/null and b/doc/src/img/superellipsoids-bowl.png differ
diff --git a/doc/src/img/superellipsoids-drop.png b/doc/src/img/superellipsoids-drop.png
new file mode 100644
index 00000000000..d9e375e8f2d
Binary files /dev/null and b/doc/src/img/superellipsoids-drop.png differ
diff --git a/doc/src/img/superellipsoids-zoo.png b/doc/src/img/superellipsoids-zoo.png
new file mode 100644
index 00000000000..0e742c9058c
Binary files /dev/null and b/doc/src/img/superellipsoids-zoo.png differ
diff --git a/doc/src/package.rst b/doc/src/package.rst
index f8f7b86ddd1..f63de7c57bc 100644
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@@ -79,7 +79,7 @@ Syntax
         *no_affinity* values = none
     *kokkos* args = keyword value ...
       zero or more keyword/value pairs may be appended
-      keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *comm/fix/reverse* or *sort* or *atom/map* or *gpu/aware* or *pair/only*
+      keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/compute/forward* or *comm/reverse* or *comm/pair/reverse* or *comm/fix/reverse* or *sort* or *atom/map* or *gpu/aware* or *pair/only*
         *neigh* value = *full* or *half*
           full = full neighbor list
           half = half neighbor list built in thread-safe manner
@@ -98,11 +98,12 @@ Syntax
         *binsize* value = size
           size = bin size for neighbor list construction (distance units)
         *comm* value = *no* or *host* or *device*
-          use value for comm/exchange and comm/forward and comm/pair/forward and comm/fix/forward and comm/reverse and comm/fix/reverse
+          use value for comm/exchange and comm/forward and comm/pair/forward and comm/fix/forward and comm/compute/forward and comm/reverse and comm/fix/reverse
         *comm/exchange* value = *no* or *host* or *device*
         *comm/forward* value = *no* or *host* or *device*
         *comm/pair/forward* value = *no* or *device*
         *comm/fix/forward* value = *no* or *device*
+        *comm/compute/forward* value = *no* or *device*
         *comm/reverse* value = *no* or *host* or *device*
           *no* = perform communication pack/unpack in non-KOKKOS mode
           *host* = perform pack/unpack on host (e.g. with OpenMP threading)
@@ -548,9 +549,9 @@ because the GPU is faster at performing pairwise interactions, then this
 rule of thumb may give too large a binsize and the default should be
 overridden with a smaller value.
 
-The *comm* and *comm/exchange* and *comm/forward* and *comm/pair/forward*
-and *comm/fix/forward* and *comm/reverse* and *comm/pair/reverse* and
-*comm/fix/reverse*
+The *comm* and *comm/exchange* and *comm/forward* and
+*comm/pair/forward* and *comm/fix/forward* and *comm/compute/forward*
+and *comm/reverse* and *comm/pair/reverse* and *comm/fix/reverse*
 keywords determine whether the host or device performs the packing and
 unpacking of data when communicating per-atom data between processors.
 "Exchange" communication happens only on timesteps that neighbor lists
@@ -558,14 +559,15 @@ are rebuilt. The data is only for atoms that migrate to new processors.
 "Forward" communication happens every timestep. "Reverse" communication
 happens every timestep if the *newton* option is on. The data is for
 atom coordinates and any other atom properties that needs to be updated
-for ghost atoms owned by each processor. "Pair/comm" controls additional
-communication in pair styles, such as pair_style EAM. "Fix/comm" controls
-additional communication in fixes, such as fix SHAKE.
+for ghost atoms owned by each processor. "Comm/pair" controls additional
+communication in pair styles, such as pair_style EAM. "Comm/fix" controls
+additional communication in fixes, such as fix SHAKE. Similarly,
+"comm/compute" controls additional communication in computes.
 
 The *comm* keyword is simply a short-cut to set the same value for all
 the comm keywords.
 
-The value options for the keywords are *no* or *host* or *device*\ . A
+The value options for the keywords are *no* or *host* or *device*. A
 value of *no* means to use the standard non-KOKKOS method of
 packing/unpacking data for the communication. A value of *host* means to
 use the host, typically a multicore CPU, and perform the
@@ -573,15 +575,15 @@ packing/unpacking in parallel with threads. A value of *device* means to
 use the device, typically a GPU, to perform the packing/unpacking
 operation.
 
-For the *comm/pair/forward* or *comm/fix/forward* or *comm/pair/reverse*
-keywords, if a value of *host* is used it will be automatically
-be changed to *no* since these keywords don't support *host* mode. The
-value of *no* will also always be used when running on the CPU, i.e. setting
-the value to *device* will have no effect if the pair/fix style is
-running on the CPU. For the *comm/fix/forward* or *comm/pair/reverse* or
-*comm/fix/reverse*
-keywords, not all styles support *device* mode and in that case will run
-in *no* mode instead.
+For the *comm/pair/forward* or *comm/fix/forward* or
+*comm/compute/forward* or *comm/pair/reverse* keywords, if a value of
+*host* is used it will automatically be changed to *no* since these
+keywords don't support *host* mode. The value of *no* will also always
+be used when running on the CPU, i.e. setting the value to *device*
+will have no effect if the pair/fix/compute style is running on the CPU. For
+the *comm/fix/forward* or *comm/compute/forward* or
+*comm/pair/reverse* or *comm/fix/reverse* keywords, not all styles
+support *device* mode and in that case will run in *no* mode instead.
 
 The optimal choice for these keywords depends on the input script and
 the hardware used. The *no* value is useful for verifying that the
diff --git a/doc/src/pair_granular.rst b/doc/src/pair_granular.rst
index 82403fe5433..46f28282325 100644
--- a/doc/src/pair_granular.rst
+++ b/doc/src/pair_granular.rst
@@ -51,36 +51,35 @@ Description
 
 The *granular* styles support a variety of options for the normal,
 tangential, rolling and twisting forces resulting from contact between
-two granular particles. This expands on the options offered by the
-:doc:`pair gran/\* <pair_gran>` pair styles. The total computed forces
+two granular particles.  This expands on the options offered by the
+:doc:`pair gran/\* <pair_gran>` pair styles.  The total computed forces
 and torques are the sum of various models selected for the normal,
 tangential, rolling and twisting modes of motion.
 
-All model choices and parameters are entered in the
-:doc:`pair_coeff <pair_coeff>` command, as described below.  Unlike
-e.g. :doc:`pair gran/hooke <pair_gran>`, coefficient values are not
-global, but can be set to different values for different combinations
-of particle types, as determined by the :doc:`pair_coeff <pair_coeff>`
-command.  If the contact model choice is the same for two particle
-types, the mixing for the cross-coefficients can be carried out
-automatically. This is shown in one of the examples, where model
-choices are the same for type 1 - type 1 as for type 2 - type2
-interactions, but coefficients are different. In this case, the
-mixed coefficients for type 1 - type 2 interactions can be determined from
-mixing rules discussed below.  For additional flexibility,
-coefficients as well as model forms can vary between particle types,
-as shown in the fourth example: type 1 - type 1 interactions are based
-on a Johnson-Kendall-Roberts normal contact model and 2-2 interactions
-are based on a DMT cohesive model (see below).  In that example, 1-1
-and 2-2 interactions have different model forms, in which case mixing of
-coefficients cannot be determined, so 1-2 interactions must be
-explicitly defined via the *pair_coeff 1 \** command, otherwise an
-error would result.
+All model choices and parameters are entered in the :doc:`pair_coeff
+<pair_coeff>` command, as described below.  Unlike e.g. :doc:`pair
+gran/hooke <pair_gran>`, coefficient values are not global, but can be
+set to different values for different combinations of particle types, as
+determined by the :doc:`pair_coeff <pair_coeff>` command.  If the
+contact model choice is the same for two particle types, the mixing for
+the cross-coefficients can be carried out automatically.  This is shown
+in one of the examples, where model choices are the same for type 1 -
+type 1 as for type 2 - type 2 interactions, but coefficients are
+different.  In this case, the mixed coefficients for type 1 - type 2
+interactions can be determined from mixing rules discussed below.  For
+additional flexibility, coefficients as well as model forms can vary
+between particle types, as shown in the fourth example: type 1 - type 1
+interactions are based on a Johnson-Kendall-Roberts normal contact model
+and 2-2 interactions are based on a DMT cohesive model (see below).  In
+that example, 1-1 and 2-2 interactions have different model forms, in
+which case mixing of coefficients cannot be determined, so 1-2
+interactions must be explicitly defined via the *pair_coeff 1 \**
+command, otherwise an error would result.
 
 ----------
 
 The first required keyword for the *pair_coeff* command is the normal
-contact model. Currently supported options for normal contact models
+contact model.  Currently supported options for normal contact models
 and their required arguments are:
 
 1. *hooke* : :math:`k_n`, :math:`\eta_{n0}` (or :math:`e`)
@@ -94,25 +93,26 @@ and their required arguments are:
 Here, :math:`k_n` is spring stiffness (with units that depend on model
 choice, see below); :math:`\eta_{n0}` is a damping prefactor (or, in its
 place a coefficient of restitution :math:`e`, depending on the choice of
-damping mode, see below); E is Young's modulus in units of
-*force*\ /\ *length*\ \^2, i.e. *pressure*\ ; :math:`\nu` is Poisson's ratio and
-:math:`\gamma` is a surface energy density, in units of
-*energy*\ /\ *length*\ \^2.
+damping mode, see below); E is Young's modulus in units of *force*\ /\
+*length*\ \^2, i.e. *pressure*\ ; :math:`\nu` is Poisson's ratio and
+:math:`\gamma` is a surface energy density, in units of *energy*\ /\
+*length*\ \^2.
 
-For the *hooke* model, the normal, elastic component of force acting
-on particle *i* due to contact with particle *j* is given by:
+For the *hooke* model, the normal, elastic component of force acting on
+particle *i* due to contact with particle *j* is given by:
 
 .. math::
 
    \mathbf{F}_{ne, Hooke} = k_n \delta_{ij} \mathbf{n}
 
-Where :math:`\delta_{ij} = R_i + R_j - \|\mathbf{r}_{ij}\|` is the particle
-overlap, :math:`R_i, R_j` are the particle radii, :math:`\mathbf{r}_{ij} =
-\mathbf{r}_i - \mathbf{r}_j` is the vector separating the two particle centers
-(note the i-j ordering so that :math:`\mathbf{F}_{ne}` is positive for repulsion),
-and :math:`\mathbf{n} = \frac{\mathbf{r}_{ij}}{\|\mathbf{r}_{ij}\|}`.  Therefore,
-for *hooke*, the units of the spring constant :math:`k_n` are *force*\ /\
-*distance*, or equivalently *mass*\ /*time\^2*.
+Where :math:`\delta_{ij} = R_i + R_j - \|\mathbf{r}_{ij}\|` is the
+particle overlap, :math:`R_i, R_j` are the particle radii,
+:math:`\mathbf{r}_{ij} = \mathbf{r}_i - \mathbf{r}_j` is the vector
+separating the two particle centers (note the i-j ordering so that
+:math:`\mathbf{F}_{ne}` is positive for repulsion), and
+:math:`\mathbf{n} = \frac{\mathbf{r}_{ij}}{\|\mathbf{r}_{ij}\|}`.
+Therefore, for *hooke*, the units of the spring constant :math:`k_n` are
+*force*\ /\ *distance*, or equivalently *mass*\ /*time\^2*.
 
 For the *hertz* model, the normal component of force is given by:
 
@@ -122,8 +122,8 @@ For the *hertz* model, the normal component of force is given by:
 
 Here, :math:`R_{eff} = R = \frac{R_i R_j}{R_i + R_j}` is the effective
 radius, denoted for simplicity as *R* from here on.  For *hertz*, the
-units of the spring constant :math:`k_n` are *force*\ /\ *length*\ \^2, or
-equivalently *pressure*\ .
+units of the spring constant :math:`k_n` are *force*\ /\ *length*\ \^2,
+or equivalently *pressure*\ .
 
 For the *hertz/material* model, the force is given by:
 
@@ -131,16 +131,17 @@ For the *hertz/material* model, the force is given by:
 
    \mathbf{F}_{ne, Hertz/material} = \frac{4}{3} E_{eff} R^{1/2}\delta_{ij}^{3/2} \mathbf{n}
 
-Here, :math:`E_{eff} = E = \left(\frac{1-\nu_i^2}{E_i} + \frac{1-\nu_j^2}{E_j}\right)^{-1}`
-is the effective Young's modulus, with :math:`\nu_i, \nu_j` the Poisson ratios
-of the particles of types *i* and *j*. :math:`E_{eff}` is denoted as *E* from here on.
-Note that if the elastic modulus and the shear modulus of the two particles are the
-same, the *hertz/material* model is equivalent to the *hertz* model with
-:math:`k_n = 4/3 E`
+Here, :math:`E_{eff} = E = \left(\frac{1-\nu_i^2}{E_i} +
+\frac{1-\nu_j^2}{E_j}\right)^{-1}` is the effective Young's modulus,
+with :math:`\nu_i, \nu_j` the Poisson ratios of the particles of types
+*i* and *j*.  :math:`E_{eff}` is denoted as *E* from here on.  Note that
+if the elastic modulus and the shear modulus of the two particles are
+the same, the *hertz/material* model is equivalent to the *hertz* model
+with :math:`k_n = 4/3 E`.
 
-The *dmt* model corresponds to the
-:ref:`(Derjaguin-Muller-Toporov) <DMT1975>` cohesive model, where the force
-is simply Hertz with an additional attractive cohesion term:
+The *dmt* model corresponds to the :ref:`(Derjaguin-Muller-Toporov)
+<DMT1975>` cohesive model, where the force is simply Hertz with an
+additional attractive cohesion term:
 
 .. math::
 
@@ -153,144 +154,155 @@ where the force is computed as:
 
    \mathbf{F}_{ne, jkr} = \left(\frac{4Ea^3}{3R} - 2\pi a^2\sqrt{\frac{4\gamma E}{\pi a}}\right)\mathbf{n}
 
-Here, :math:`a` is the radius of the contact zone, related to the overlap
-:math:`\delta` according to:
+Here, :math:`a` is the radius of the contact zone, related to the
+overlap :math:`\delta` according to:
 
 .. math::
 
    \delta = a^2/R - 2\sqrt{\pi \gamma a/E}
 
 LAMMPS internally inverts the equation above to solve for *a* in terms
-of :math:`\delta`, then solves for the force in the previous
-equation. Additionally, note that the JKR model allows for a tensile
-force beyond contact (i.e. for :math:`\delta < 0`), up to a maximum of
-:math:`3\pi\gamma R` (also known as the 'pull-off' force).  Note that this
-is a hysteretic effect, where particles that are not contacting
+of :math:`\delta`, then solves for the force in the previous equation.
+Additionally, note that the JKR model allows for a tensile force beyond
+contact (i.e. for :math:`\delta < 0`), up to a maximum of
+:math:`3\pi\gamma R` (also known as the 'pull-off' force).  Note that
+this is a hysteretic effect, where particles that are not contacting
 initially will not experience force until they come into contact
 :math:`\delta \geq 0`; as they move apart and (:math:`\delta < 0`), they
-experience a tensile force up to :math:`3\pi\gamma R`, at which point they
-lose contact.
+experience a tensile force up to :math:`3\pi\gamma R`, at which point
+they lose contact.
 
 .. note::
 
-   Typically, neighbor lists are constructed for pair granular by testing
-   whether finite sized particles overlap (using their radii). However,
-   this is not the case for normal models which can interact beyond
-   contact, e.g. *jkr*. Instead, the maximum radius for each particle
-   type is first calculated then used to calculate a maximum per-type
-   cutoff distance. For polydisperse systems, this affects the performance
-   of the :doc:`multi neighbor <neigh_modify>` option where one should
-   assign atoms of similar radii the same type. See the
-   :doc:`pair lj/cut/sphere <pair_lj_cut_sphere>` page for a related discussion.
-
-The *mdr* model is a mechanically-derived contact model designed to capture the
-contact response between adhesive elastic-plastic particles into large deformation.
-The theoretical foundations of the *mdr* model are detailed in the
-two-part series :ref:`Zunker and Kamrin Part I <Zunker2024I>` and
-:ref:`Zunker and Kamrin Part II <Zunker2024II>`. Further development
-and demonstrations of its application to industrially relevant powder
-compaction processes are presented in :ref:`Zunker et al. <Zunker2025>`.
-If you use the *mdr* normal model the only supported damping option is
-the *mdr* damping class described below.
+   Typically, neighbor lists are constructed for pair granular by
+   testing whether finite sized particles overlap (using their radii).
+   However, this is not the case for normal models which can interact
+   beyond contact, e.g. *jkr*.  Instead, the maximum radius for each
+   particle type is first calculated then used to calculate a maximum
+   per-type cutoff distance.  For polydisperse systems, this affects the
+   performance of the :doc:`multi neighbor <neigh_modify>` option where
+   one should assign atoms of similar radii the same type.  See the
+   :doc:`pair lj/cut/sphere <pair_lj_cut_sphere>` page for a related
+   discussion.
+
+The *mdr* model is a mechanically-derived contact model designed to
+capture the contact response between adhesive elastic-plastic particles
+into large deformation.  The theoretical foundations of the *mdr* model
+are detailed in the two-part series :ref:`Zunker and Kamrin Part I
+<Zunker2024I>` and :ref:`Zunker and Kamrin Part II <Zunker2024II>`.
+Further development and demonstrations of its application to
+industrially relevant powder compaction processes are presented in
+:ref:`Zunker et al. <Zunker2025>`.  If you use the *mdr* normal model
+the only supported damping option is the *mdr* damping class described
+below.
 
 The model requires the following inputs:
 
-   1. *Young's modulus* :math:`E > 0` : The Young's modulus is commonly reported
-   for various powders.
-
-   2. *Poisson's ratio* :math:`0 \le \nu \le 0.5` : The Poisson's ratio is commonly
+   1. *Young's modulus* :math:`E > 0` : The Young's modulus is commonly
    reported for various powders.
 
-   3. *Yield stress* :math:`Y \ge 0` : The yield stress is often known for powders
-   composed of materials such as metals but may be unreported for ductile organic
-   materials, in which case it can be treated as a free parameter.
+   2. *Poisson's ratio* :math:`0 \le \nu \le 0.5` : The Poisson's ratio
+   is commonly reported for various powders.
+
+   3. *Yield stress* :math:`Y \ge 0` : The yield stress is often known
+   for powders composed of materials such as metals but may be
+   unreported for ductile organic materials, in which case it can be
+   treated as a free parameter.
 
-   4. *Effective surface energy* :math:`\Delta\gamma \ge 0` : The effective surface
-   energy for powder compaction applications is most easily determined through its
-   relation to the more commonly reported critical stress intensity factor
-   :math:`K_{Ic} = \sqrt{2\Delta\gamma E/(1-\nu^2)}`.
+   4. *Effective surface energy* :math:`\Delta\gamma \ge 0` : The
+   effective surface energy for powder compaction applications is most
+   easily determined through its relation to the more commonly reported
+   critical stress intensity factor :math:`K_{Ic} = \sqrt{2\Delta\gamma
+   E/(1-\nu^2)}`.
 
-   5. *Critical confinement ratio* :math:`0 \le \psi_b \le 1` : The critical confinement
-   ratio is a tunable parameter that determines when the bulk elastic response is
-   triggered. Lower values of :math:`\psi_b` delay the onset of the bulk elastic
-   response.
+   5. *Critical confinement ratio* :math:`0 \le \psi_b \le 1` : The
+   critical confinement ratio is a tunable parameter that determines
+   when the bulk elastic response is triggered.  Lower values of
+   :math:`\psi_b` delay the onset of the bulk elastic response.
 
-   6. *Damping coefficent* :math:`\eta_{n0} \ge 0` : The damping coefficient
-   is a tunable parameter that controls damping in the normal direction.
+   6. *Damping coefficient* :math:`\eta_{n0} \ge 0` : The damping
+   coefficient is a tunable parameter that controls damping in the
+   normal direction.
 
 .. note::
 
-   The values for :math:`E`, :math:`\nu`, :math:`Y`, and :math:`\Delta\gamma` (i.e.,
-   :math:`K_{Ic}`) should be selected for zero porosity to reflect the intrinsic
-   material property rather than the bulk powder property.
-
-The *mdr* model produces a nonlinear force-displacement response, therefore the
-critical timestep :math:`\Delta t` depends on the inputs and level of
-deformation. As a conservative starting point the timestep can be assumed to be
-dictated by the bulk elastic response such that
-:math:`\Delta t = 0.08\sqrt{m/k_\textrm{bulk}}`, where :math:`m` is the mass of
-the smallest particle and :math:`k_\textrm{bulk} = \kappa R_\textrm{min}` is an
-effective stiffness related to the bulk elastic response.
-Here, :math:`\kappa = E/(3(1-2\nu))` is the bulk modulus and
+   The values for :math:`E`, :math:`\nu`, :math:`Y`, and
+   :math:`\Delta\gamma` (i.e., :math:`K_{Ic}`) should be selected for
+   zero porosity to reflect the intrinsic material property rather than
+   the bulk powder property.
+
+The *mdr* model produces a nonlinear force-displacement response,
+therefore the critical timestep :math:`\Delta t` depends on the inputs
+and level of deformation.  As a conservative starting point the timestep
+can be assumed to be dictated by the bulk elastic response such that
+:math:`\Delta t = 0.08\sqrt{m/k_\textrm{bulk}}`, where :math:`m` is the
+mass of the smallest particle and :math:`k_\textrm{bulk} = \kappa
+R_\textrm{min}` is an effective stiffness related to the bulk elastic
+response.  Here, :math:`\kappa = E/(3(1-2\nu))` is the bulk modulus and
 :math:`R_\textrm{min}` is the radius of the smallest particle.
 
 The *atom_style* must be set to *sphere 1* to enable dynamic particle
-radii. The *mdr* model is designed to respect the incompressibility of
+radii.  The *mdr* model is designed to respect the incompressibility of
 plastic deformation and inherently tracks free surface displacements
-induced by all particle contacts. In practice, this means that all particles
-begin with an initial radius, however as compaction occurs and plastic
-deformation is accumulated, a new enlarged apparent radius is defined to
-ensure that that volume change due to plastic deformation is not lost.
-This apparent radius is stored as the *atom radius* meaning it is used
-for subsequent neighbor list builds and contact detection checks. The
-advantage of this is that multi-neighbor dependent effects such as
-formation of secondary contacts caused by radial expansion are captured
-by the *mdr* model. Setting *atom_style sphere 1* ensures that updates to
-the particle radii are properly reflected throughout the simulation.
+induced by all particle contacts.  In practice, this means that all
+particles begin with an initial radius; however, as compaction occurs and
+plastic deformation is accumulated, a new enlarged apparent radius is
+defined to ensure that the volume change due to plastic deformation is
+not lost.  This apparent radius is stored as the *atom radius* meaning
+it is used for subsequent neighbor list builds and contact detection
+checks.  The advantage of this is that multi-neighbor dependent effects
+such as formation of secondary contacts caused by radial expansion are
+captured by the *mdr* model.  Setting *atom_style sphere 1* ensures that
+updates to the particle radii are properly reflected throughout the
+simulation.
 
 .. code-block:: LAMMPS
 
    atom_style sphere 1
 
-Newton's third law must be set to *off*. This ensures that the neighbor lists
-are constructed properly for the topological penalty algorithm used to screen
-for non-physical contacts occurring through obstructing particles, an issue
-prevalent under large deformation conditions. For more information on this
-algorithm see :ref:`Zunker et al. <Zunker2025>`.
+Newton's third law must be set to *off*.  This ensures that the neighbor
+lists are constructed properly for the topological penalty algorithm
+used to screen for non-physical contacts occurring through obstructing
+particles, an issue prevalent under large deformation conditions.  For
+more information on this algorithm see :ref:`Zunker et
+al. <Zunker2025>`.
 
 .. code-block:: LAMMPS
 
    newton off
 
-The definition of multiple *mdr* models in the *pair_style* is currently not
-supported. Similarly, the *mdr* model cannot be combined with a different normal
-model in the *pair_style*. Physically this means that only one homogeneous
-collection of particles governed by a single *mdr* model is allowed.
+The definition of multiple *mdr* models in the *pair_style* is currently
+not supported.  Similarly, the *mdr* model cannot be combined with a
+different normal model in the *pair_style*.  Physically this means that
+only one homogeneous collection of particles governed by a single *mdr*
+model is allowed.
 
-The *mdr* model currently only supports *fix wall/gran/region*, not
-*fix wall/gran*. If the *mdr* model is specified for the *pair_style*
-any *fix wall/gran/region* commands must also use the *mdr* model.
+The *mdr* model currently only supports *fix wall/gran/region*, not *fix
+wall/gran*.  If the *mdr* model is specified for the *pair_style* any
+*fix wall/gran/region* commands must also use the *mdr* model.
 Additionally, the following *mdr* inputs must match between the
 *pair_style* and *fix wall/gran/region* definitions: :math:`E`,
-:math:`\nu`, :math:`Y`, :math:`\psi_b`, and :math:`\eta_{n0}`. The exception
-is :math:`\Delta\gamma`, which may vary, permitting different
-adhesive behaviors between particle-particle and particle-wall interactions.
+:math:`\nu`, :math:`Y`, :math:`\psi_b`, and :math:`\eta_{n0}`.  The
+exception is :math:`\Delta\gamma`, which may vary, permitting different
+adhesive behaviors between particle-particle and particle-wall
+interactions.
 
 .. note::
 
-   The *mdr* model has a number of custom *property/atom* and *pair/local* definitions that
-   can be called in the input file. The useful properties for visualization
-   and analysis are described below.
+   The *mdr* model has a number of custom *property/atom* and
+   *pair/local* definitions that can be called in the input file. The
+   useful properties for visualization and analysis are described below.
 
 In addition to contact forces the *mdr* model also tracks the following
 quantities for each particle: elastic volume change, average normal
-stress components, total surface area involved in
-contact, and individual contact areas. In the input script, these quantities are
-initialized by calling *run 0* and can then be accessed using subsequent *compute*
-commands. The last *compute* command uses *pair/local p13* to calculate the pairwise
-contact areas for each active contact in the *group-ID*. Due to the use of an apparent
-radius in the *mdr* model, the keyword/arg pair *cutoff radius* must be specified for
-*pair/local* to properly detect existing contacts.
+stress components, total surface area involved in contact, and
+individual contact areas.  In the input script, these quantities are
+initialized by calling *run 0* and can then be accessed using subsequent
+*compute* commands.  The last *compute* command uses *pair/local p13* to
+calculate the pairwise contact areas for each active contact in the
+*group-ID*.  Due to the use of an apparent radius in the *mdr* model,
+the keyword/arg pair *cutoff radius* must be specified for *pair/local*
+to properly detect existing contacts.
 
 .. code-block:: LAMMPS
 
@@ -305,10 +317,10 @@ radius in the *mdr* model, the keyword/arg pair *cutoff radius* must be specifie
 .. note::
 
    The *mdr* model has two example input scripts within the
-   *examples/granular* directory. The first is a die compaction
-   simulation involving 200 particles named *in.tableting.200*.
-   The second is a triaxial compaction simulation involving 12
-   particles named *in.triaxial.compaction.12*.
+   *examples/granular* directory.  The first is a die compaction
+   simulation involving 200 particles named *in.tableting.200*.  The
+   second is a triaxial compaction simulation involving 12 particles
+   named *in.triaxial.compaction.12*.
 
 ----------
 
@@ -323,15 +335,15 @@ Here, :math:`\mathbf{v}_{n,rel} = (\mathbf{v}_j - \mathbf{v}_i) \cdot
 \mathbf{n}\ \mathbf{n}` is the component of relative velocity along
 :math:`\mathbf{n}`.
 
-The optional *damping* keyword to the *pair_coeff* command followed by
-a keyword determines the model form of the damping factor :math:`\eta_n`,
-and the interpretation of the :math:`\eta_{n0}` or :math:`e` coefficients
-specified as part of the normal contact model settings. The *damping*
-keyword and corresponding model form selection may be appended
-anywhere in the *pair coeff* command.  Note that the choice of damping
-model affects both the normal and tangential damping (and depending on
-other settings, potentially also the twisting damping).  The options
-for the damping model currently supported are:
+The optional *damping* keyword to the *pair_coeff* command followed by a
+keyword determines the model form of the damping factor :math:`\eta_n`,
+and the interpretation of the :math:`\eta_{n0}` or :math:`e`
+coefficients specified as part of the normal contact model settings.
+The *damping* keyword and corresponding model form selection may be
+appended anywhere in the *pair_coeff* command.  Note that the choice of
+damping model affects both the normal and tangential damping (and
+depending on other settings, potentially also the twisting damping).
+The options for the damping model currently supported are:
 
 1. *velocity*
 2. *mass_velocity*
@@ -350,8 +362,8 @@ user-specified damping coefficient in the *normal* model:
 
    \eta_n = \eta_{n0}
 
-Here, :math:`\eta_{n0}` is the damping coefficient specified for the normal
-contact model, in units of *mass*\ /\ *time*\ .
+Here, :math:`\eta_{n0}` is the damping coefficient specified for the
+normal contact model, in units of *mass*\ /\ *time*\ .
 
 For *damping mass_velocity*, the normal damping is given by:
 
@@ -359,108 +371,114 @@ For *damping mass_velocity*, the normal damping is given by:
 
    \eta_n = \eta_{n0} m_{eff}
 
-Here, :math:`\eta_{n0}` is the damping coefficient specified for the normal
-contact model, in units of 1/\ *time* and
-:math:`m_{eff} = m_i m_j/(m_i + m_j)` is the effective mass.
-Use *damping mass_velocity* to reproduce the damping behavior of
-*pair gran/hooke/\**.
+Here, :math:`\eta_{n0}` is the damping coefficient specified for the
+normal contact model, in units of 1/\ *time* and :math:`m_{eff} = m_i
+m_j/(m_i + m_j)` is the effective mass.  Use *damping mass_velocity* to
+reproduce the damping behavior of *pair gran/hooke/\**.
 
-The *damping viscoelastic* model is based on the viscoelastic
-treatment of :ref:`(Brilliantov et al) <Brill1996>`, where the normal
-damping is given by:
+The *damping viscoelastic* model is based on the viscoelastic treatment
+of :ref:`(Brilliantov et al) <Brill1996_1>`, where the normal damping is
+given by:
 
 .. math::
 
    \eta_n = \eta_{n0}\ a m_{eff}
 
-Here, *a* is the contact radius, given by :math:`a =\sqrt{R\delta}`
-for all models except *jkr*, for which it is given implicitly according
-to :math:`\delta = a^2/R - 2\sqrt{\pi \gamma a/E}`.  For *damping viscoelastic*,
-:math:`\eta_{n0}` is in units of 1/(\ *time*\ \*\ *distance*\ ).
+Here, *a* is the contact radius, given by :math:`a =\sqrt{R\delta}` for
+all models except *jkr*, for which it is given implicitly according to
+:math:`\delta = a^2/R - 2\sqrt{\pi \gamma a/E}`.  For *damping
+viscoelastic*, :math:`\eta_{n0}` is in units of 1/(\ *time*\ \*\
+*distance*\ ).
 
-The *tsuji* model is based on the work of :ref:`(Tsuji et al) <Tsuji1992>`.
-Here, the damping coefficient specified as part of the normal model is interpreted
-as a restitution coefficient :math:`e`. The damping constant :math:`\eta_n` is
-given by:
+The *tsuji* model is based on the work of :ref:`(Tsuji et al)
+<Tsuji1992_1>`.  Here, the damping coefficient specified as part of the
+normal model is interpreted as a restitution coefficient :math:`e`.  The
+damping constant :math:`\eta_n` is given by:
 
 .. math::
 
    \eta_n = \alpha (m_{eff}k_{nd})^{1/2}
 
-where :math:`k_{nd}` is an effective harmonic stiffness equal to the ratio of
-the normal force to the overlap. For example, :math:`k_{nd} = 4/3Ea` for a
-Hertz contact model based on material parameters with :math:`a` being
-the contact radius of :math:`\sqrt{\delta R}`. For Hooke, :math:`k_{nd}`
-is simply the spring constant or :math:`k_{n}`. This damping model is not
-compatible with cohesive normal models such as *JKR* or *DMT*. The parameter
-:math:`\alpha` is related to the restitution coefficient *e* according to:
+where :math:`k_{nd}` is an effective harmonic stiffness equal to the
+ratio of the normal force to the overlap.  For example, :math:`k_{nd} =
+4/3Ea` for a Hertz contact model based on material parameters with
+:math:`a` being the contact radius of :math:`\sqrt{\delta R}`.  For
+Hooke, :math:`k_{nd}` is simply the spring constant or :math:`k_{n}`.
+This damping model is not compatible with cohesive normal models such as
+*JKR* or *DMT*.  The parameter :math:`\alpha` is related to the
+restitution coefficient *e* according to:
 
 .. math::
 
    \alpha = 1.2728-4.2783e+11.087e^2-22.348e^3+27.467e^4-18.022e^5+4.8218e^6
 
 The dimensionless coefficient of restitution :math:`e` specified as part
-of the normal contact model parameters should be between 0 and 1, but
-no error check is performed on this.
+of the normal contact model parameters should be between 0 and 1, but no
+error check is performed on this.
 
-The *coeff_restitution* model is useful when a specific normal coefficient of
-restitution :math:`e` is required. It operates much like the *Tsuji* model
-but, the normal coefficient of restitution :math:`e` is specified as an input
-in place of the usual :math:`\eta_{n0}` value in the normal model. Following
-the approach of :ref:`(Brilliantov et al) <Brill1996>`, when using the *hooke*
-normal model, *coeff_restitution* then calculates the damping coefficient as:
+The *coeff_restitution* model is useful when a specific normal
+coefficient of restitution :math:`e` is required.  It operates much like
+the *Tsuji* model, but the normal coefficient of restitution :math:`e`
+is specified as an input in place of the usual :math:`\eta_{n0}` value
+in the normal model.  Following the approach of :ref:`(Brilliantov et al)
+<Brill1996_1>`, when using the *hooke* normal model, *coeff_restitution*
+then calculates the damping coefficient as:
 
 .. math::
 
    \eta_n = \sqrt{\frac{4m_{eff}k_{nd}}{1+\left( \frac{\pi}{\log(e)}\right)^2}} ,
 
-where :math:`k_{nd}` is the same stiffness defined in the above *Tsuji* model.
-For any other normal model, e.g. the *hertz* and *hertz/material* models, the damping
-coefficient is:
+where :math:`k_{nd}` is the same stiffness defined in the above *Tsuji*
+model.  For any other normal model, e.g. the *hertz* and
+*hertz/material* models, the damping coefficient is:
 
 .. math::
 
    \eta_n = -2\sqrt{\frac{5}{6}}\frac{\log(e)}{\sqrt{\pi^2+(\log(e))^2}}\sqrt{\frac{3}{2}k_{nd} m_{eff}} ,
 
-Since *coeff_restitution* accounts for the effective mass, effective radius,
-and pairwise overlaps (except when used with the *hooke* normal model) when calculating
-the damping coefficient, it accurately reproduces the specified coefficient of
-restitution for both monodisperse and polydisperse particle pairs.  This damping
-model is not compatible with cohesive normal models such as *JKR* or *DMT*.
+Since *coeff_restitution* accounts for the effective mass, effective
+radius, and pairwise overlaps (except when used with the *hooke* normal
+model) when calculating the damping coefficient, it accurately
+reproduces the specified coefficient of restitution for both
+monodisperse and polydisperse particle pairs.  This damping model is not
+compatible with cohesive normal models such as *JKR* or *DMT*.
 
-The *mdr* damping class contains multiple damping models that can be toggled between
-by specifying different integer values for the :math:`d_{type}` input parameter. This
-damping option is only compatible with the normal *mdr* contact model.
+The *mdr* damping class contains multiple damping models that can be
+toggled between by specifying different integer values for the
+:math:`d_{type}` input parameter.  This damping option is only
+compatible with the normal *mdr* contact model.
 
-Setting :math:`d_{type} = 1` is the suggested damping option. This specifies a damping
-model that takes into account the contact stiffness :math:`k_{mdr}` calculated
-by the normal *mdr* contact model to determine the damping coefficient:
+Setting :math:`d_{type} = 1` is the suggested damping option.  This
+specifies a damping model that takes into account the contact stiffness
+:math:`k_{mdr}` calculated by the normal *mdr* contact model to
+determine the damping coefficient:
 
 .. math::
 
    \eta_n = \eta_{n0} (m_{eff}k_{mdr})^{1/2},
 
-where :math:`k_{mdr}` is proportional to contact radius :math:`a_{mdr}` tracked by the
-normal *mdr* contact model:
+where :math:`k_{mdr}` is proportional to contact radius :math:`a_{mdr}`
+tracked by the normal *mdr* contact model:
 
 .. math::
 
    k_{mdr} = 2 E_{eff} a_{mdr}.
 
-In this case, :math:`\eta_{n0}` is simply a dimensionless coefficient that scales the
-the overall damping coefficient.
+In this case, :math:`\eta_{n0}` is simply a dimensionless coefficient
+that scales the overall damping coefficient.
 
-The other supported option is :math:`d_{type} = 2`, which defines a simple damping model
-similar to the *velocity* option
+The other supported option is :math:`d_{type} = 2`, which defines a
+simple damping model similar to the *velocity* option
 
 .. math::
 
    \eta_n = \eta_{n0},
 
-but has additional checks to avoid non-physical damping after plastic deformation.
+but has additional checks to avoid non-physical damping after plastic
+deformation.
 
-The total normal force is computed as the sum of the elastic and
-damping components:
+The total normal force is computed as the sum of the elastic and damping
+components:
 
 .. math::
 
@@ -469,8 +487,8 @@ damping components:
 ----------
 
 The *pair_coeff* command also requires specification of the tangential
-contact model. The required keyword *tangential* is expected, followed
-by the model choice and associated parameters. Currently supported
+contact model.  The required keyword *tangential* is expected, followed
+by the model choice and associated parameters.  Currently supported
 tangential model choices and their expected parameters are as follows:
 
 1. *linear_nohistory* : :math:`x_{\gamma,t}`, :math:`\mu_s`
@@ -487,13 +505,15 @@ coefficient, and :math:`k_t` is the tangential stiffness coefficient.
 
 For *tangential linear_nohistory*, a simple velocity-dependent Coulomb
 friction criterion is used, which mimics the behavior of the *pair
-gran/hooke* style. The tangential force :math:`\mathbf{F}_t` is given by:
+gran/hooke* style.  The tangential force :math:`\mathbf{F}_t` is given
+by:
 
 .. math::
 
    \mathbf{F}_t =  -\min(\mu_t F_{n0}, \|\mathbf{F}_\mathrm{t,damp}\|) \mathbf{t}
 
-The tangential damping force :math:`\mathbf{F}_\mathrm{t,damp}` is given by:
+The tangential damping force :math:`\mathbf{F}_\mathrm{t,damp}` is given
+by:
 
 .. math::
 
@@ -509,17 +529,20 @@ the normal damping :math:`\eta_n` (see above):
 The normal damping prefactor :math:`\eta_n` is determined by the choice
 of the *damping* keyword, as discussed above.  Thus, the *damping*
 keyword also affects the tangential damping.  The parameter
-:math:`x_{\gamma,t}` is a scaling coefficient. Several works in the
-literature use :math:`x_{\gamma,t} = 1` (:ref:`Marshall <Marshall2009>`,
-:ref:`Tsuji et al <Tsuji1992>`, :ref:`Silbert et al <Silbert2001>`).  The relative
-tangential velocity at the point of contact is given by
-:math:`\mathbf{v}_{t, rel} = \mathbf{v}_{t} - (R_i\boldsymbol{\Omega}_i + R_j\boldsymbol{\Omega}_j) \times \mathbf{n}`, where :math:`\mathbf{v}_{t} = \mathbf{v}_r - \mathbf{v}_r\cdot\mathbf{n}\ \mathbf{n}`,
-:math:`\mathbf{v}_r = \mathbf{v}_j - \mathbf{v}_i` .
-The direction of the applied force is :math:`\mathbf{t} = \mathbf{v_{t,rel}}/\|\mathbf{v_{t,rel}}\|` .
+:math:`x_{\gamma,t}` is a scaling coefficient.  Several works in the
+literature use :math:`x_{\gamma,t} = 1` (:ref:`Marshall
+<Marshall2009_1>`, :ref:`Tsuji et al <Tsuji1992_1>`, :ref:`Silbert et al
+<Silbert2001_1>`).  The relative tangential velocity at the point of
+contact is given by :math:`\mathbf{v}_{t, rel} = \mathbf{v}_{t} -
+(R_i\boldsymbol{\Omega}_i + R_j\boldsymbol{\Omega}_j) \times
+\mathbf{n}`, where :math:`\mathbf{v}_{t} = \mathbf{v}_r -
+\mathbf{v}_r\cdot\mathbf{n}\ \mathbf{n}`, :math:`\mathbf{v}_r =
+\mathbf{v}_j - \mathbf{v}_i` .  The direction of the applied force is
+:math:`\mathbf{t} = \mathbf{v_{t,rel}}/\|\mathbf{v_{t,rel}}\|` .
 
 The normal force value :math:`F_{n0}` used to compute the critical force
-depends on the form of the contact model. For non-cohesive models
-(\ *hertz*, *hertz/material*, *hooke*\ ), it is given by the magnitude of
+depends on the form of the contact model.  For non-cohesive models (\
+*hertz*, *hertz/material*, *hooke*\ ), it is given by the magnitude of
 the normal force:
 
 .. math::
@@ -527,10 +550,10 @@ the normal force:
    F_{n0} = \|\mathbf{F}_n\|
 
 For cohesive models such as *jkr* and *dmt*, the critical force is
-adjusted so that the critical tangential force approaches
-:math:`\mu_t F_{pulloff}`, see :ref:`Marshall <Marshall2009>`, equation 43, and
-:ref:`Thornton <Thornton1991>`.  For both models, :math:`F_{n0}` takes the
-form:
+adjusted so that the critical tangential force approaches :math:`\mu_t
+F_{pulloff}`, see :ref:`Marshall <Marshall2009_1>`, equation 43, and
+:ref:`Thornton <Thornton1991_1>`.  For both models, :math:`F_{n0}` takes
+the form:
 
 .. math::
 
@@ -542,10 +565,10 @@ Where :math:`F_{pulloff} = 3\pi \gamma R` for *jkr*, and
 The remaining tangential options all use accumulated tangential
 displacement (i.e. contact history), except for the options
 *mindlin/force* and *mindlin_rescale/force*, that use accumulated
-tangential force instead, and are discussed further below.
-The accumulated tangential displacement is discussed in details below
-in the context of the *linear_history* option. The same treatment of
-the accumulated displacement applies to the other options as well.
+tangential force instead, and are discussed further below.  The
+accumulated tangential displacement is discussed in detail below in the
+context of the *linear_history* option.  The same treatment of the
+accumulated displacement applies to the other options as well.
 
 For *tangential linear_history*, the tangential force is given by:
 
@@ -560,22 +583,21 @@ during the entire duration of the contact:
 
    \mathbf{\xi} = \int_{t0}^t \mathbf{v}_{t,rel}(\tau) \mathrm{d}\tau
 
-This accumulated tangential displacement must be adjusted to account
-for changes in the frame of reference of the contacting pair of
-particles during contact. This occurs due to the overall motion of the
-contacting particles in a rigid-body-like fashion during the duration
-of the contact. There are two modes of motion that are relevant: the
+This accumulated tangential displacement must be adjusted to account for
+changes in the frame of reference of the contacting pair of particles
+during contact.  This occurs due to the overall motion of the contacting
+particles in a rigid-body-like fashion during the duration of the
+contact.  There are two modes of motion that are relevant: the
 'tumbling' rotation of the contacting pair, which changes the
 orientation of the plane in which tangential displacement occurs; and
 'spinning' rotation of the contacting pair about the vector connecting
 their centers of mass (:math:`\mathbf{n}`).  Corrections due to the
-former mode of motion are made by rotating the accumulated
-displacement into the plane that is tangential to the contact vector
-at each step, or equivalently removing any component of the tangential
-displacement that lies along :math:`\mathbf{n}`, and rescaling to
-preserve the magnitude.  This follows the discussion in
-:ref:`Luding <Luding2008>`, see equation 17 and relevant discussion in that
-work:
+former mode of motion are made by rotating the accumulated displacement
+into the plane that is tangential to the contact vector at each step, or
+equivalently removing any component of the tangential displacement that
+lies along :math:`\mathbf{n}`, and rescaling to preserve the magnitude.
+This follows the discussion in :ref:`Luding <Luding2008_1>`, see
+equation 17 and relevant discussion in that work:
 
 .. math::
 
@@ -583,14 +605,14 @@ work:
 
 Here, :math:`\mathbf{\xi'}` is the accumulated displacement prior to the
 current time step and :math:`\mathbf{\xi}` is the corrected
-displacement. Corrections to the displacement due to the second mode
-of motion described above (rotations about :math:`\mathbf{n}`) are not
+displacement.  Corrections to the displacement due to the second mode of
+motion described above (rotations about :math:`\mathbf{n}`) are not
 currently implemented, but are expected to be minor for most
 simulations.
 
 Furthermore, when the tangential force exceeds the critical force, the
-tangential displacement is re-scaled to match the value for the
-critical force (see :ref:`Luding <Luding2008>`, equation 20 and related
+tangential displacement is re-scaled to match the value for the critical
+force (see :ref:`Luding <Luding2008_1>`, equation 20 and related
 discussion):
 
 .. math::
@@ -598,7 +620,7 @@ discussion):
    \mathbf{\xi} = -\frac{1}{k_t}\left(\mu_t F_{n0}\mathbf{t} - \mathbf{F}_{t,damp}\right)
 
 The tangential force is added to the total normal force (elastic plus
-damping) to produce the total force on the particle. The tangential
+damping) to produce the total force on the particle.  The tangential
 force also acts at the contact point (defined as the center of the
 overlap region) to induce a torque on each particle according to:
 
@@ -610,42 +632,46 @@ overlap region) to induce a torque on each particle according to:
 
    \mathbf{\tau}_j = -(R_j - 0.5 \delta) \mathbf{n} \times \mathbf{F}_t
 
-For *tangential mindlin*, the :ref:`Mindlin <Mindlin1949>` no-slip solution
-is used which differs from the *linear_history* option by an additional factor
-of :math:`a`, the radius of the contact region. The tangential force is given by:
+For *tangential mindlin*, the :ref:`Mindlin <Mindlin1949>` no-slip
+solution is used which differs from the *linear_history* option by an
+additional factor of :math:`a`, the radius of the contact region.  The
+tangential force is given by:
 
 .. math::
 
    \mathbf{F}_t =  -\min(\mu_t F_{n0}, \|-k_t a \mathbf{\xi} + \mathbf{F}_\mathrm{t,damp}\|) \mathbf{t}
 
-Here, :math:`a` is the radius of the contact region, given by :math:`a =\sqrt{R\delta}`
-for all normal contact models, except for *jkr*, where it is given
-implicitly by :math:`\delta = a^2/R - 2\sqrt{\pi \gamma a/E}`, see
-discussion above. To match the Mindlin solution, one should set
-:math:`k_t = 8G_{eff}`, where :math:`G_{eff}` is the effective shear modulus given by:
+Here, :math:`a` is the radius of the contact region, given by :math:`a
+=\sqrt{R\delta}` for all normal contact models, except for *jkr*, where
+it is given implicitly by :math:`\delta = a^2/R - 2\sqrt{\pi \gamma
+a/E}`, see discussion above.  To match the Mindlin solution, one should
+set :math:`k_t = 8G_{eff}`, where :math:`G_{eff}` is the effective shear
+modulus given by:
 
 .. math::
 
    G_{eff} = \left(\frac{2-\nu_i}{G_i} + \frac{2-\nu_j}{G_j}\right)^{-1}
 
-where :math:`G_i` is the shear modulus of a particle of type :math:`i`, related to Young's
-modulus :math:`E_i` and Poisson's ratio :math:`\nu_i` by :math:`G_i = E_i/(2(1+\nu_i))`.
-This can also be achieved by specifying *NULL* for :math:`k_t`, in which case a
-normal contact model that specifies material parameters :math:`E_i` and
-:math:`\nu_i` is required (e.g. *hertz/material*, *dmt* or *jkr*\ ). In this
-case, mixing of the shear modulus for different particle types *i* and
-*j* is done according to the formula above.
+where :math:`G_i` is the shear modulus of a particle of type :math:`i`,
+related to Young's modulus :math:`E_i` and Poisson's ratio :math:`\nu_i`
+by :math:`G_i = E_i/(2(1+\nu_i))`.  This can also be achieved by
+specifying *NULL* for :math:`k_t`, in which case a normal contact model
+that specifies material parameters :math:`E_i` and :math:`\nu_i` is
+required (e.g. *hertz/material*, *dmt* or *jkr*\ ).  In this case,
+mixing of the shear modulus for different particle types *i* and *j* is
+done according to the formula above.
 
 .. note::
 
-   The radius of the contact region :math:`a` depends on the normal overlap.
-   As a result, the tangential force for *mindlin* can change due to
-   a variation in normal overlap, even with no change in tangential displacement.
+   The radius of the contact region :math:`a` depends on the normal
+   overlap.  As a result, the tangential force for *mindlin* can change
+   due to a variation in normal overlap, even with no change in
+   tangential displacement.
 
 For *tangential mindlin/force*, the accumulated elastic tangential force
 characterizes the contact history, instead of the accumulated tangential
-displacement. This prevents the dependence of the tangential force on the
-normal overlap as noted above. The tangential force is given by:
+displacement.  This prevents the dependence of the tangential force on
+the normal overlap as noted above.  The tangential force is given by:
 
 .. math::
 
@@ -658,19 +684,20 @@ The increment of the elastic component of the tangential force
 
    \mathrm{d}\mathbf{F}_{te} = -k_t a \mathbf{v}_{t,rel} \mathrm{d}\tau
 
-The changes in frame of reference of the contacting pair of particles during
-contact are accounted for by the same formula as above, replacing the
-accumulated tangential displacement :math:`\xi`, by the accumulated tangential
-elastic force :math:`F_{te}`. When the tangential force exceeds the critical
-force, the tangential force is directly re-scaled to match the value for
-the critical force:
+The changes in frame of reference of the contacting pair of particles
+during contact are accounted for by the same formula as above, replacing
+the accumulated tangential displacement :math:`\xi`, by the accumulated
+tangential elastic force :math:`F_{te}`.  When the tangential force
+exceeds the critical force, the tangential force is directly re-scaled
+to match the value for the critical force:
 
 .. math::
 
    \mathbf{F}_{te} = - \mu_t F_{n0}\mathbf{t} + \mathbf{F}_{t,damp}
 
-The same rules as those described for *mindlin* apply regarding the tangential
-stiffness and mixing of the shear modulus for different particle types.
+The same rules as those described for *mindlin* apply regarding the
+tangential stiffness and mixing of the shear modulus for different
+particle types.
 
 The *mindlin_rescale* option uses the same form as *mindlin*, but the
 magnitude of the tangential displacement is re-scaled as the contact
@@ -680,64 +707,69 @@ unloads, i.e. if :math:`a < a_{t_{n-1}}`:
 
    \mathbf{\xi} = \mathbf{\xi_{t_{n-1}}} \frac{a}{a_{t_{n-1}}}
 
-Here, :math:`t_{n-1}` indicates the value at the previous time
-step. This rescaling accounts for the fact that a decrease in the
-contact area upon unloading leads to the contact being unable to
-support the previous tangential loading, and spurious energy is
-created without the rescaling above (:ref:`Walton <WaltonPC>` ).
+Here, :math:`t_{n-1}` indicates the value at the previous time step.
+This rescaling accounts for the fact that a decrease in the contact area
+upon unloading leads to the contact being unable to support the previous
+tangential loading, and spurious energy is created without the rescaling
+above (:ref:`Walton <WaltonPC_1>`).
 
 .. note::
 
-   For *mindlin*, a decrease in the tangential force already occurs as the
-   contact unloads, due to the dependence of the tangential force on the normal
-   force described above. By re-scaling :math:`\xi`, *mindlin_rescale*
-   effectively re-scales the tangential force twice, i.e., proportionally to
-   :math:`a^2`. This peculiar behavior results from use of the accumulated
-   tangential displacement to characterize the contact history. Although
-   *mindlin_rescale* remains available for historic reasons and backward
-   compatibility purposes, it should be avoided in favor of *mindlin_rescale/force*.
-
-The *mindlin_rescale/force* option uses the same form as *mindlin/force*,
-but the magnitude of the tangential elastic force is re-scaled as the contact
-unloads, i.e. if :math:`a < a_{t_{n-1}}`:
+   For *mindlin*, a decrease in the tangential force already occurs as
+   the contact unloads, due to the dependence of the tangential force on
+   the normal force described above.  By re-scaling :math:`\xi`,
+   *mindlin_rescale* effectively re-scales the tangential force twice,
+   i.e., proportionally to :math:`a^2`.  This peculiar behavior results
+   from use of the accumulated tangential displacement to characterize
+   the contact history.  Although *mindlin_rescale* remains available
+   for historic reasons and backward compatibility purposes, it should
+   be avoided in favor of *mindlin_rescale/force*.
+
+The *mindlin_rescale/force* option uses the same form as
+*mindlin/force*, but the magnitude of the tangential elastic force is
+re-scaled as the contact unloads, i.e. if :math:`a < a_{t_{n-1}}`:
 
 .. math::
 
    \mathbf{F}_{te} = \mathbf{F}_{te, t_{n-1}} \frac{a}{a_{t_{n-1}}}
 
-This approach provides a better approximation of the :ref:`Mindlin-Deresiewicz <Mindlin1953>`
-laws and is more consistent than *mindlin_rescale*. See discussions in
-:ref:`Thornton et al, 2013 <Thornton2013>`, particularly equation 18(b) of that
-work and associated discussion, and :ref:`Agnolin and Roux, 2007 <AgnolinRoux2007>`,
-particularly Appendix A.
+This approach provides a better approximation of the
+:ref:`Mindlin-Deresiewicz <Mindlin1953>` laws and is more consistent
+than *mindlin_rescale*.  See discussions in :ref:`Thornton et al, 2013
+<Thornton2013_1>`, particularly equation 18(b) of that work and
+associated discussion, and :ref:`Agnolin and Roux, 2007
+<AgnolinRoux2007>`, particularly Appendix A.
 
 ----------
 
 The optional *rolling* keyword enables rolling friction, which resists
-pure rolling motion of particles. The options currently supported are:
+pure rolling motion of particles.  The options currently supported are:
 
 1. *none*
 2. *sds* : :math:`k_{roll}`, :math:`\gamma_{roll}`, :math:`\mu_{roll}`
 
-If the *rolling* keyword is not specified, the model defaults to *none*\ .
+If the *rolling* keyword is not specified, the model defaults to
+*none*\ .
 
 For *rolling sds*, rolling friction is computed via a
 spring-dashpot-slider, using a 'pseudo-force' formulation, as detailed
-by :ref:`Luding <Luding2008>`. Unlike the formulation in
-:ref:`Marshall <Marshall2009>`, this allows for the required adjustment of
-rolling displacement due to changes in the frame of reference of the
-contacting pair.  The rolling pseudo-force is computed analogously to
-the tangential force:
+by :ref:`Luding <Luding2008_1>`.  Unlike the formulation in
+:ref:`Marshall <Marshall2009_1>`, this allows for the required
+adjustment of rolling displacement due to changes in the frame of
+reference of the contacting pair.  The rolling pseudo-force is computed
+analogously to the tangential force:
 
 .. math::
 
    \mathbf{F}_{roll,0} =  k_{roll} \mathbf{\xi}_{roll}  - \gamma_{roll} \mathbf{v}_{roll}
 
-Here, :math:`\mathbf{v}_{roll} = -R(\boldsymbol{\Omega}_i - \boldsymbol{\Omega}_j)
-\times \mathbf{n}` is the relative rolling velocity, as given in
-:ref:`Wang et al <Wang2015>` and :ref:`Luding <Luding2008>`. This differs from the
-expressions given by :ref:`Kuhn and Bagi <Kuhn2004>` and used in :ref:`Marshall <Marshall2009>`;
-see :ref:`Wang et al <Wang2015>` for details. The rolling displacement is given by:
+Here, :math:`\mathbf{v}_{roll} = -R(\boldsymbol{\Omega}_i -
+\boldsymbol{\Omega}_j) \times \mathbf{n}` is the relative rolling
+velocity, as given in :ref:`Wang et al <Wang2015>` and :ref:`Luding
+<Luding2008_1>`.  This differs from the expressions given by :ref:`Kuhn
+and Bagi <Kuhn2004>` and used in :ref:`Marshall <Marshall2009_1>`; see
+:ref:`Wang et al <Wang2015>` for details.  The rolling displacement is
+given by:
 
 .. math::
 
@@ -750,11 +782,11 @@ exceeds a critical value:
 
    \mathbf{F}_{roll} =  \min(\mu_{roll} F_{n,0}, \|\mathbf{F}_{roll,0}\|)\mathbf{k}
 
-Here, :math:`\mathbf{k} = \mathbf{v}_{roll}/\|\mathbf{v}_{roll}\|` is the direction of
-the pseudo-force.  As with tangential displacement, the rolling
-displacement is rescaled when the critical force is exceeded, so that
-the spring length corresponds the critical force. Additionally, the
-displacement is adjusted to account for rotations of the frame of
+Here, :math:`\mathbf{k} = \mathbf{v}_{roll}/\|\mathbf{v}_{roll}\|` is
+the direction of the pseudo-force.  As with tangential displacement, the
+rolling displacement is rescaled when the critical force is exceeded, so
+that the spring length corresponds to the critical force.  Additionally,
+the displacement is adjusted to account for rotations of the frame of
 reference of the two contacting particles in a manner analogous to the
 tangential displacement.
 
@@ -772,40 +804,41 @@ opposite torque on each particle, according to:
 
 ----------
 
-The optional *twisting* keyword enables twisting friction, which
-resists rotation of two contacting particles about the vector
-:math:`\mathbf{n}` that connects their centers. The options currently
-supported are:
+The optional *twisting* keyword enables twisting friction, which resists
+rotation of two contacting particles about the vector :math:`\mathbf{n}`
+that connects their centers.  The options currently supported are:
 
 1. *none*
 2. *sds* : :math:`k_{twist}`, :math:`\gamma_{twist}`, :math:`\mu_{twist}`
 3. *marshall*
 
-If the *twisting* keyword is not specified, the model defaults to *none*\ .
+If the *twisting* keyword is not specified, the model defaults to
+*none*\ .
 
 For both *twisting sds* and *twisting marshall*, a history-dependent
-spring-dashpot-slider is used to compute the twisting torque. Because
+spring-dashpot-slider is used to compute the twisting torque.  Because
 twisting displacement is a scalar, there is no need to adjust for
-changes in the frame of reference due to rotations of the particle
-pair. The formulation in :ref:`Marshall <Marshall2009>` therefore provides
+changes in the frame of reference due to rotations of the particle pair.
+The formulation in :ref:`Marshall <Marshall2009_1>` therefore provides
 the most straightforward treatment:
 
 .. math::
 
    \tau_{twist,0} = -k_{twist}\xi_{twist} - \gamma_{twist}\Omega_{twist}
 
-Here :math:`\xi_{twist} = \int_{t_0}^t \Omega_{twist} (\tau) \mathrm{d}\tau` is
-the twisting angular displacement, and
-:math:`\Omega_{twist} = (\mathbf{\Omega}_i - \mathbf{\Omega}_j) \cdot \mathbf{n}`
-is the relative twisting angular velocity. The torque is then truncated according to:
+Here :math:`\xi_{twist} = \int_{t_0}^t \Omega_{twist} (\tau)
+\mathrm{d}\tau` is the twisting angular displacement, and
+:math:`\Omega_{twist} = (\mathbf{\Omega}_i - \mathbf{\Omega}_j) \cdot
+\mathbf{n}` is the relative twisting angular velocity.  The torque is
+then truncated according to:
 
 .. math::
 
    \tau_{twist} = \min(\mu_{twist} F_{n,0}, \tau_{twist,0})
 
 Similar to the sliding and rolling displacement, the angular
-displacement is rescaled so that it corresponds to the critical value
-if the twisting torque exceeds this critical value:
+displacement is rescaled so that it corresponds to the critical value if
+the twisting torque exceeds this critical value:
 
 .. math::
 
@@ -816,8 +849,8 @@ and :math:`\mu_{twist}` are simply the user input parameters that follow
 the *twisting sds* keywords in the *pair_coeff* command.
 
 For *twisting_marshall*, the coefficients are expressed in terms of
-sliding friction coefficients, as discussed in
-:ref:`Marshall <Marshall2009>` (see equations 32 and 33 of that work):
+sliding friction coefficients, as discussed in :ref:`Marshall
+<Marshall2009_1>` (see equations 32 and 33 of that work):
 
 .. math::
 
@@ -844,104 +877,104 @@ Finally, the twisting torque on each particle is given by:
 ----------
 
 If two particles are moving away from each other while in contact, there
-is a possibility that the particles could experience an effective attractive
-force due to damping. If the optional *limit_damping* keyword is used, this option
-will zero out the normal component of the force if there is an effective
-attractive force. This keyword cannot be used with the JKR or DMT models.
+is a possibility that the particles could experience an effective
+attractive force due to damping.  If the optional *limit_damping*
+keyword is used, this option will zero out the normal component of the
+force if there is an effective attractive force.  This keyword cannot be
+used with the JKR or DMT models.
 
 ----------
 
-The standard velocity-Verlet integration scheme's half-step staggering of
-position and velocity can introduce inaccuracies in frictional tangential
-force calculations, resulting in unphysical kinematics in certain systems.
-These effects are particularly pronounced in polydisperse frictional flows
-characterized by large-to-small size ratios exceeding three. The
-*synchronized_verlet* flag implements an alternate Velocity-Verlet integration
-scheme, as detailed in :ref:`Vyas et al <Vyas2025>`, that synchronizes position
-and velocity updates for force evaluation. By refining tangential force
-calculations, the *synchronized_verlet* method ensures physically consistent
-results without significantly impacting computational cost.
+The standard velocity-Verlet integration scheme's half-step staggering
+of position and velocity can introduce inaccuracies in frictional
+tangential force calculations, resulting in unphysical kinematics in
+certain systems.  These effects are particularly pronounced in
+polydisperse frictional flows characterized by large-to-small size
+ratios exceeding three.  The *synchronized_verlet* flag implements an
+alternate Velocity-Verlet integration scheme, as detailed in :ref:`Vyas
+et al <Vyas2025>`, that synchronizes position and velocity updates for
+force evaluation.  By refining tangential force calculations, the
+*synchronized_verlet* method ensures physically consistent results
+without significantly impacting computational cost.
 
 ----------
 
-The optional *heat* keyword enables heat conduction. The options currently
-supported are:
+The optional *heat* keyword enables heat conduction.  The options
+currently supported are:
 
 1. *none*
 2. *radius* : :math:`k_{s}`
 3. *area* : :math:`h_{s}`
 
-If the *heat* keyword is not specified, the model defaults to *none*. All
-heat models calculate an additional pairwise quantity accessible by the
-single() function (described below) which is the heat conducted between the
-two particles.
+If the *heat* keyword is not specified, the model defaults to
+*none*.  All heat models calculate an additional pairwise quantity
+accessible by the single() function (described below) which is the heat
+conducted between the two particles.
 
-For *heat* *radius*, the heat
-:math:`Q` conducted between two particles is given by
+For *heat* *radius*, the heat :math:`Q` conducted between two particles
+is given by
 
 .. math::
 
    Q = 2 k_{s} a \Delta T
 
-where :math:`\Delta T` is the difference in the two particles' temperature,
-:math:`k_{s}` is a non-negative numeric value for the conductivity (in units
-of power/(length*temperature)), and :math:`a` is the radius of the contact and
-depends on the normal force model. This is the model proposed by
-:ref:`Vargas and McCarthy <VargasMcCarthy2001>`.
+where :math:`\Delta T` is the difference in the two particles'
+temperature, :math:`k_{s}` is a non-negative numeric value for the
+conductivity (in units of power/(length*temperature)), and :math:`a` is
+the radius of the contact and depends on the normal force model.  This
+is the model proposed by :ref:`Vargas and McCarthy
+<VargasMcCarthy2001>`.
 
-For *heat* *area*, the heat
-:math:`Q` conducted between two particles is given by
+For *heat* *area*, the heat :math:`Q` conducted between two particles is
+given by
 
 .. math::
 
    Q = h_{s} A \Delta T
 
-
-where :math:`\Delta T` is the difference in the two particles' temperature,
-:math:`h_{s}` is a non-negative numeric value for the heat transfer
-coefficient (in units of power/(area*temperature)), and :math:`A=\pi a^2` is
-the area of the contact and depends on the normal force model.
+where :math:`\Delta T` is the difference in the two particles'
+temperature, :math:`h_{s}` is a non-negative numeric value for the heat
+transfer coefficient (in units of power/(area*temperature)), and
+:math:`A=\pi a^2` is the area of the contact and depends on the normal
+force model.
 
 Note that the option *none* must either be used in all or none of of the
-*pair_coeff* calls. See :doc:`fix heat/flow <fix_heat_flow>` and
-:doc:`fix property/atom <fix_property_atom>` for more information on this
-option.
+*pair_coeff* calls.  See :doc:`fix heat/flow <fix_heat_flow>` and
+:doc:`fix property/atom <fix_property_atom>` for more information on
+this option.
 
 ----------
 
-The *granular* pair style can reproduce the behavior of the
-*pair gran/\** styles with the appropriate settings (some very
-minor differences can be expected due to corrections in
-displacement history frame-of-reference, and the application
-of the torque at the center of the contact rather than
-at each particle). The first example above
-is equivalent to *pair gran/hooke 1000.0 NULL 50.0 50.0 0.4 1*\ .
-The second example is equivalent to
-*pair gran/hooke/history 1000.0 500.0 50.0 50.0 0.4 1*\ .
-The third example is equivalent to
-*pair gran/hertz/history 1000.0 500.0 50.0 50.0 0.4 1 limit_damping*\ .
+The *granular* pair style can reproduce the behavior of the *pair
+gran/\** styles with the appropriate settings (some very minor
+differences can be expected due to corrections in displacement history
+frame-of-reference, and the application of the torque at the center of
+the contact rather than at each particle).  The first example above is
+equivalent to *pair gran/hooke 1000.0 NULL 50.0 50.0 0.4 1*\ .  The
+second example is equivalent to *pair gran/hooke/history 1000.0 500.0
+50.0 50.0 0.4 1*\ .  The third example is equivalent to *pair
+gran/hertz/history 1000.0 500.0 50.0 50.0 0.4 1 limit_damping*\ .
 
 ----------
 
 LAMMPS automatically sets pairwise cutoff values for *pair_style
 granular* based on particle radii (and in the case of *jkr* pull-off
-distances). In the vast majority of situations, this is adequate.
+distances).  In the vast majority of situations, this is adequate.
 However, a cutoff value can optionally be appended to the *pair_style
-granular* command to specify a global cutoff (i.e. a cutoff for all
-atom types). Additionally, the optional *cutoff* keyword can be passed
-to the *pair_coeff* command, followed by a cutoff value.  This will
-set a pairwise cutoff for the atom types in the *pair_coeff* command.
-These options may be useful in some rare cases where the automatic
-cutoff determination is not sufficient, e.g.  if particle diameters
-are being modified via the *fix adapt* command. In that case, the
-global cutoff specified as part of the *pair_style granular* command
-is applied to all atom types, unless it is overridden for a given atom
-type combination by the *cutoff* value specified in the *pair coeff*
-command.  If *cutoff* is only specified in the *pair coeff* command
-and no global cutoff is appended to the *pair_style granular* command,
-then LAMMPS will use that cutoff for the specified atom type
-combination, and automatically set pairwise cutoffs for the remaining
-atom types.
+granular* command to specify a global cutoff (i.e. a cutoff for all atom
+types).  Additionally, the optional *cutoff* keyword can be passed to
+the *pair_coeff* command, followed by a cutoff value.  This will set a
+pairwise cutoff for the atom types in the *pair_coeff* command.  These
+options may be useful in some rare cases where the automatic cutoff
+determination is not sufficient, e.g.  if particle diameters are being
+modified via the *fix adapt* command.  In that case, the global cutoff
+specified as part of the *pair_style granular* command is applied to all
+atom types, unless it is overridden for a given atom type combination by
+the *cutoff* value specified in the *pair_coeff* command.  If *cutoff*
+is only specified in the *pair_coeff* command and no global cutoff is
+appended to the *pair_style granular* command, then LAMMPS will use that
+cutoff for the specified atom type combination, and automatically set
+pairwise cutoffs for the remaining atom types.
 
 ----------
 
@@ -949,17 +982,17 @@ Mixing, shift, table, tail correction, restart, rRESPA info
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
 The :doc:`pair_modify <pair_modify>` mix, shift, table, and tail options
-are not relevant for granular pair styles.
+are not relevant for any granular pair styles.
 
-Mixing of coefficients is carried out using geometric averaging for
-most quantities, e.g. if friction coefficient for type 1-type 1
-interactions is set to :math:`\mu_1`, and friction coefficient for type
-2-type 2 interactions is set to :math:`\mu_2`, the friction coefficient
-for type1-type2 interactions is computed as :math:`\sqrt{\mu_1\mu_2}`
+Mixing of coefficients is carried out using geometric averaging for most
+quantities, e.g. if the friction coefficient for type 1-type 1 interactions
+is set to :math:`\mu_1`, and the friction coefficient for type 2-type 2
+interactions is set to :math:`\mu_2`, the friction coefficient for
+type 1-type 2 interactions is computed as :math:`\sqrt{\mu_1\mu_2}`
 (unless explicitly specified to a different value by a *pair_coeff 1 2
-...* command). The exception to this is elastic modulus, only
-applicable to *hertz/material*, *dmt* and *jkr* normal contact
-models. In that case, the effective elastic modulus is computed as:
+...* command).  The exception to this is elastic modulus, only
+applicable to *hertz/material*, *dmt* and *jkr* normal contact models.
+In that case, the effective elastic modulus is computed as:
 
 .. math::
 
@@ -978,37 +1011,36 @@ or
 
    E_{eff,ij} = \frac{E_{ij}}{2(1-\nu_{ij}^2)}
 
-These pair styles write their information to :doc:`binary restart files <restart>`,
-so a pair_style command does not need to be specified in an input script that reads
-a restart file.
+This pair style writes its information to :doc:`binary restart files
+<restart>`, so a pair_style command does not need to be specified in an
+input script that reads a restart file.
 
-These pair styles can only be used via the *pair* keyword of the
-:doc:`run_style respa <run_style>` command.  They do not support the
+This pair style can only be used via the *pair* keyword of the
+:doc:`run_style respa <run_style>` command.  It does not support the
 *inner*, *middle*, *outer* keywords.
 
-The single() function of these pair styles returns 0.0 for the energy
-of a pairwise interaction, since energy is not conserved in these
-dissipative potentials.  It also returns only the normal component of
-the pairwise interaction force.  However, the single() function also
-calculates at least 13 extra pairwise quantities.  The first 3 are the
-components of the tangential force between particles I and J, acting
-on particle I.  The fourth is the magnitude of this tangential force.
-The next 3 (5-7) are the components of the rolling torque acting on
-particle I. The next entry (8) is the magnitude of the rolling torque.
-The next entry (9) is the magnitude of the twisting torque acting
-about the vector connecting the two particle centers.
-The next 3 (10-12) are the components of the vector connecting
-the centers of the two particles (x_I - x_J). If a granular sub-model
-calculates additional contact information (e.g. the heat sub-models
-calculate the amount of heat exchanged), these quantities are appended
-to the end of this list. First, any extra values from the normal sub-model
-are appended followed by the damping, tangential, rolling, twisting, then
-heat models. See the descriptions of specific granular sub-models above
-for information on any extra quantities. If two or more models are
-defined by pair coefficients, the size of the array is set by the
-maximum number of extra quantities in a model but the order of quantities
-is determined by each model's specific set of sub-models. Any unused
-quantities are zeroed.
+The single() function of this pair style returns 0.0 for the energy of a
+pairwise interaction, since energy is not conserved in these dissipative
+potentials.  It also returns only the normal component of the pairwise
+interaction force.  However, the single() function also calculates at
+least 13 extra pairwise quantities.  The first 3 are the components of
+the tangential force between particles I and J, acting on particle I.
+The fourth is the magnitude of this tangential force.  The next 3 (5-7)
+are the components of the rolling torque acting on particle I.  The next
+entry (8) is the magnitude of the rolling torque.  The next entry (9) is
+the magnitude of the twisting torque acting about the vector connecting
+the two particle centers.  The next 3 (10-12) are the components of the
+vector connecting the centers of the two particles (x_I - x_J).  If a
+granular sub-model calculates additional contact information (e.g. the
+heat sub-models calculate the amount of heat exchanged), these
+quantities are appended to the end of this list.  First, any extra
+values from the normal sub-model are appended followed by the damping,
+tangential, rolling, twisting, then heat models.  See the descriptions
+of specific granular sub-models above for information on any extra
+quantities.  If two or more models are defined by pair coefficients, the
+size of the array is set by the maximum number of extra quantities in a
+model but the order of quantities is determined by each model's specific
+set of sub-models.  Any unused quantities are zeroed.
 
 These extra quantities can be accessed by the :doc:`compute pair/local
 <compute_pair_local>` command, as *p1*, *p2*, ..., *p12*\ .
@@ -1018,30 +1050,29 @@ These extra quantities can be accessed by the :doc:`compute pair/local
 Restrictions
 """"""""""""
 
-This pair style is part of the GRANULAR package.  It is
-only enabled if LAMMPS was built with that package.
-See the :doc:`Build package <Build_package>` page for more info.
+This pair style is part of the GRANULAR package.  It is only enabled if
+LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
 
-This pair style requires that atoms store per-particle radius,
-torque, and angular velocity (omega) as defined by the
-:doc:`atom_style sphere <atom_style>`.
+This pair style requires that atoms store per-particle radius, torque,
+and angular velocity (omega) as defined by the :doc:`atom_style sphere
+<atom_style>`.
 
-This pair style requires you to use the :doc:`comm_modify vel yes <comm_modify>`
-command so that velocities are stored by ghost atoms.
+This pair style requires you to use the :doc:`comm_modify vel yes
+<comm_modify>` command so that velocities are stored by ghost atoms.
 
 This pair style will not restart exactly when using the
 :doc:`read_restart <read_restart>` command, though it should provide
-statistically similar results.  This is because the forces it
-computes depend on atom velocities and the atom velocities have
-been propagated half a timestep between the force computation and
-when the restart is written, due to using Velocity Verlet time
-integration. See the :doc:`read_restart <read_restart>` command
-for more details.
-
-Accumulated values for individual contacts are saved to restart
-files but are not saved to data files. Therefore, forces may
-differ significantly when a system is reloaded using the
-:doc:`read_data <read_data>` command.
+statistically similar results.  This is because the forces it computes
+depend on atom velocities and the atom velocities have been propagated
+half a timestep between the force computation and when the restart is
+written, due to using Velocity Verlet time integration.  See the
+:doc:`read_restart <read_restart>` command for more details.
+
+Accumulated values for individual contacts are saved to restart files
+but are not saved to data files.  Therefore, forces may differ
+significantly when a system is reloaded using the :doc:`read_data
+<read_data>` command.
 
 Related commands
 """"""""""""""""
@@ -1058,13 +1089,13 @@ For the *pair_coeff* settings: *damping viscoelastic*, *rolling none*,
 References
 """"""""""
 
-.. _Brill1996:
+.. _Brill1996_1:
 
 **(Brilliantov et al, 1996)** Brilliantov, N. V., Spahn, F., Hertzsch,
 J. M., & Poschel, T. (1996).  Model for collisions in granular
 gases. Physical review E, 53(5), 5382.
 
-.. _Tsuji1992:
+.. _Tsuji1992_1:
 
 **(Tsuji et al, 1992)** Tsuji, Y., Tanaka, T., & Ishida,
 T. (1992). Lagrangian numerical simulation of plug flow of
@@ -1105,18 +1136,18 @@ Tang, P., & Kamrin, K. (2025). Experimentally validated DEM for large
 deformation powder compaction: Mechanically-derived contact model and
 screening of non-physical contacts. Powder Technology, 120972.
 
-.. _Luding2008:
+.. _Luding2008_1:
 
 **(Luding, 2008)** Luding, S. (2008). Cohesive, frictional powders:
 contact models for tension. Granular matter, 10(4), 235.
 
-.. _Marshall2009:
+.. _Marshall2009_1:
 
 **(Marshall, 2009)** Marshall, J. S. (2009). Discrete-element modeling
 of particulate aerosol flows.  Journal of Computational Physics,
 228(5), 1541-1561.
 
-.. _Silbert2001:
+.. _Silbert2001_1:
 
 **(Silbert, 2001)** Silbert, L. E., Ertas, D., Grest, G. S., Halsey,
 T. C., Levine, D., & Plimpton, S. J. (2001).  Granular flow down an
@@ -1135,7 +1166,7 @@ solids and structures, 41(21), 5793-5820.
 W. W. (2015).  Rolling and sliding in 3-D discrete element
 models. Particuology, 23, 49-55.
 
-.. _Thornton1991:
+.. _Thornton1991_1:
 
 **(Thornton, 1991)** Thornton, C. (1991). Interparticle sliding in the
 presence of adhesion.  J. Phys. D: Appl. Phys. 24 1942
@@ -1145,14 +1176,14 @@ presence of adhesion.  J. Phys. D: Appl. Phys. 24 1942
 **(Mindlin, 1949)** Mindlin, R. D. (1949). Compliance of elastic bodies
 in contact.  J. Appl. Mech., ASME 16, 259-268.
 
-.. _Thornton2013:
+.. _Thornton2013_1:
 
 **(Thornton et al, 2013)** Thornton, C., Cummins, S. J., & Cleary,
 P. W. (2013).  An investigation of the comparative behavior of
 alternative contact force models during inelastic collisions. Powder
 Technology, 233, 30-46.
 
-.. _WaltonPC:
+.. _WaltonPC_1:
 
 **(Otis R. Walton)** Walton, O.R., Personal Communication
 
diff --git a/doc/src/pair_granular_superellipsoid.rst b/doc/src/pair_granular_superellipsoid.rst
index 3bff828c0d1..9c4d63f0611 100644
--- a/doc/src/pair_granular_superellipsoid.rst
+++ b/doc/src/pair_granular_superellipsoid.rst
@@ -286,7 +286,7 @@ m_j/(m_i + m_j)` is the effective mass.  Use *damping mass_velocity* to
 reproduce the damping behavior of *pair gran/hooke/\**.
 
 The *damping viscoelastic* model is based on the viscoelastic treatment
-of :ref:`(Brilliantov et al) <Brill1996>`, where the normal damping is
+of :ref:`(Brilliantov et al) <Brill1996_2>`, where the normal damping is
 given by:
 
 .. math::
@@ -336,8 +336,8 @@ The normal damping prefactor :math:`\eta_n` is determined by the choice
 of the *damping* keyword, as discussed above.  Thus, the *damping*
 keyword also affects the tangential damping.  The parameter
 :math:`x_{\gamma,t}` is a scaling coefficient.  Several works in the
-literature use :math:`x_{\gamma,t} = 1` (:ref:`Marshall <Marshall2009>`,
-:ref:`Tsuji et al <Tsuji1992>`, :ref:`Silbert et al <Silbert2001>`).
+literature use :math:`x_{\gamma,t} = 1` (:ref:`Marshall <Marshall2009_2>`,
+:ref:`Tsuji et al <Tsuji1992_2>`, :ref:`Silbert et al <Silbert2001_2>`).
 The relative tangential velocity at the point of contact is given by
 :math:`\mathbf{v}_{t, rel} = \mathbf{v}_{t} -
 (R_i\boldsymbol{\Omega}_i + R_j\boldsymbol{\Omega}_j) \times
@@ -386,7 +386,7 @@ former mode of motion are made by rotating the accumulated displacement
 into the plane that is tangential to the contact vector at each step, or
 equivalently removing any component of the tangential displacement that
 lies along :math:`\mathbf{n}`, and rescaling to preserve the magnitude.
-This follows the discussion in :ref:`Luding <Luding2008>`, see equation
+This follows the discussion in :ref:`Luding <Luding2008_2>`, see equation
 17 and relevant discussion in that work:
 
 .. math::
@@ -402,7 +402,7 @@ simulations.
 
 Furthermore, when the tangential force exceeds the critical force, the
 tangential displacement is re-scaled to match the value for the critical
-force (see :ref:`Luding <Luding2008>`, equation 20 and related
+force (see :ref:`Luding <Luding2008_2>`, equation 20 and related
 discussion):
 
 .. math::
@@ -531,24 +531,31 @@ For the *pair_coeff* settings: *damping viscoelastic*
 References
 """"""""""
 
-.. _Brill1996:
+.. _Brill1996_2:
 
 **(Brilliantov et al, 1996)** Brilliantov, N. V., Spahn, F., Hertzsch,
 J. M., & Poschel, T. (1996).  Model for collisions in granular
 gases. Physical review E, 53(5), 5382.
 
-.. _Luding2008:
+.. _Tsuji1992_2:
+
+**(Tsuji et al, 1992)** Tsuji, Y., Tanaka, T., & Ishida,
+T. (1992). Lagrangian numerical simulation of plug flow of
+cohesionless particles in a horizontal pipe. Powder technology, 71(3),
+239-250.
+
+.. _Luding2008_2:
 
 **(Luding, 2008)** Luding, S. (2008). Cohesive, frictional powders:
 contact models for tension. Granular matter, 10(4), 235.
 
-.. _Marshall2009:
+.. _Marshall2009_2:
 
 **(Marshall, 2009)** Marshall, J. S. (2009). Discrete-element modeling
 of particulate aerosol flows.  Journal of Computational Physics,
 228(5), 1541-1561.
 
-.. _Silbert2001:
+.. _Silbert2001_2:
 
 **(Silbert, 2001)** Silbert, L. E., Ertas, D., Grest, G. S., Halsey,
 T. C., Levine, D., & Plimpton, S. J. (2001).  Granular flow down an
@@ -556,19 +563,19 @@ inclined plane: Bagnold scaling and rheology. Physical Review E,
 64(5), 051302.
 
 
-.. _Thornton1991:
+.. _Thornton1991_2:
 
 **(Thornton, 1991)** Thornton, C. (1991). Interparticle sliding in the
 presence of adhesion.  J. Phys. D: Appl. Phys. 24 1942
 
-.. _Thornton2013:
+.. _Thornton2013_2:
 
 **(Thornton et al, 2013)** Thornton, C., Cummins, S. J., & Cleary,
 P. W. (2013).  An investigation of the comparative behavior of
 alternative contact force models during inelastic collisions. Powder
 Technology, 233, 30-46.
 
-.. _WaltonPC:
+.. _WaltonPC_2:
 
 **(Otis R. Walton)** Walton, O.R., Personal Communication
 
diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index 32e7d435a89..d8ed1a1e258 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -1767,6 +1767,7 @@ isenthalpic
 ish
 Ishida
 iso
+isochoric
 isodemic
 isoenergetic
 isoenthalpic
diff --git a/examples/ASPHERE/superellipsoid_gran/in.super_table b/examples/ASPHERE/superellipsoid_gran/in.super_table
new file mode 100644
index 00000000000..18d887aec09
--- /dev/null
+++ b/examples/ASPHERE/superellipsoid_gran/in.super_table
@@ -0,0 +1,69 @@
+units           si
+atom_style      ellipsoid superellipsoid
+dimension       3
+boundary p p p
+comm_modify vel yes
+newton off
+
+# create display of superellipsoidal particle shapes: 5x5 grid, first block exponent varies along x, second along y
+
+region          box block 0 10 0 10 0 1
+create_box      25 box
+create_atoms 1 single 1 1 0.5
+create_atoms 2 single 3 1 0.5
+create_atoms 3 single 5 1 0.5
+create_atoms 4 single 7 1 0.5
+create_atoms 5 single 9 1 0.5
+create_atoms 6 single 1 3 0.5
+create_atoms 7 single 3 3 0.5
+create_atoms 8 single 5 3 0.5
+create_atoms 9 single 7 3 0.5
+create_atoms 10 single 9 3 0.5
+create_atoms 11 single 1 5 0.5
+create_atoms 12 single 3 5 0.5
+create_atoms 13 single 5 5 0.5
+create_atoms 14 single 7 5 0.5
+create_atoms 15 single 9 5 0.5
+create_atoms 16 single 1 7 0.5
+create_atoms 17 single 3 7 0.5
+create_atoms 18 single 5 7 0.5
+create_atoms 19 single 7 7 0.5
+create_atoms 20 single 9 7 0.5
+create_atoms 21 single 1 9 0.5
+create_atoms 22 single 3 9 0.5
+create_atoms 23 single 5 9 0.5
+create_atoms 24 single 7 9 0.5
+create_atoms 25 single 9 9 0.5
+
+set type * mass 1.0
+set type * shape 1.0 1.2 1.5
+set type 1 block 2.0 2.0
+set type 2 block 3.0 2.0
+set type 3 block 5.0 2.0
+set type 4 block 8.0 2.0
+set type 5 block 16.0 2.0
+set type 6 block 2.0 3.0
+set type 7 block 3.0 3.0
+set type 8 block 5.0 3.0
+set type 9 block 8.0 3.0
+set type 10 block 16.0 3.0
+set type 11 block 2.0 5.0
+set type 12 block 3.0 5.0
+set type 13 block 5.0 5.0
+set type 14 block 8.0 5.0
+set type 15 block 16.0 5.0
+set type 16 block 2.0 8.0
+set type 17 block 3.0 8.0
+set type 18 block 5.0 8.0
+set type 19 block 8.0 8.0
+set type 20 block 16.0 8.0
+set type 21 block 2.0 16.0
+set type 22 block 3.0 16.0
+set type 23 block 5.0 16.0
+set type 24 block 8.0 16.0
+set type 25 block 16.0 16.0
+
+run 0
+write_dump all image super-table.ppm type type ellipsoid atom 1 6 0.2 size 600 600 &
+       zoom 1.94872 shiny 0.6 fsaa yes view 30 0 box no 0.0 &
+       modify boxcolor yellow backcolor black backcolor2 white
diff --git a/python/lammps/mliap/pytorch.py b/python/lammps/mliap/pytorch.py
index 71ce83d6407..0f79a7b5cc6 100644
--- a/python/lammps/mliap/pytorch.py
+++ b/python/lammps/mliap/pytorch.py
@@ -135,6 +135,7 @@ def forward(self, elems, descriptors, beta, energy,use_gpu_data=False):
         device = self.device
         if (use_gpu_data and (device is None) and (str(beta.device).find('CUDA') == 1)):
             device = 'cuda' #Override device as it wasn't defined in the model
+            self.model = self.model.to(device)
         with torch.autograd.enable_grad():
 
             if (use_gpu_data):
diff --git a/src/ASPHERE/compute_erotate_asphere.cpp b/src/ASPHERE/compute_erotate_asphere.cpp
index 4608b41319e..aa016a24459 100644
--- a/src/ASPHERE/compute_erotate_asphere.cpp
+++ b/src/ASPHERE/compute_erotate_asphere.cpp
@@ -102,7 +102,7 @@ double ComputeERotateAsphere::compute_scalar()
   // no point particles since divide by inertia
 
   double length;
-  double *shape, *quat, *block;
+  double *quat;
   double wbody[3], inertia[3];
   double rot[3][3];
   double erotate = 0.0;
@@ -112,15 +112,13 @@ double ComputeERotateAsphere::compute_scalar()
       if (ellipsoid && (ebonus || ebonus_super) && (ellipsoid[i] >= 0)) {
 
         if (atom->superellipsoid_flag) {
-          shape = ebonus_super[ellipsoid[i]].shape;
           quat = ebonus_super[ellipsoid[i]].quat;
-          block = ebonus_super[ellipsoid[i]].block;
-          // principal moments of inertia
+          // principal moments of inertia are pre-computed
           inertia[0] = ebonus_super[ellipsoid[i]].inertia[0];
           inertia[1] = ebonus_super[ellipsoid[i]].inertia[1];
           inertia[2] = ebonus_super[ellipsoid[i]].inertia[2];
         } else {
-          shape = ebonus[ellipsoid[i]].shape;
+          auto *shape = ebonus[ellipsoid[i]].shape;
           quat = ebonus[ellipsoid[i]].quat;
           // principal moments of inertia
           inertia[0] = rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]) / 5.0;
diff --git a/src/ASPHERE/math_extra_superellipsoids.cpp b/src/ASPHERE/math_extra_superellipsoids.cpp
index d32a396fa2f..43b39943265 100644
--- a/src/ASPHERE/math_extra_superellipsoids.cpp
+++ b/src/ASPHERE/math_extra_superellipsoids.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -13,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Jacopo Bilotto (EPFL), Jibril B. Coulibaly
+   Contributing authors: Jacopo Bilotto (EPFL), Jibril B. Coulibaly
 ------------------------------------------------------------------------- */
 
 #include "math_extra_superellipsoids.h"
@@ -23,26 +22,28 @@
 
 #include <cmath>
 
-namespace MathExtraSuperellipsoids {
+namespace {
+constexpr double TIKHONOV_SCALE = 1e-14;
 
-inline constexpr double TIKHONOV_SCALE =
-    1e-14;
+constexpr int ITERMAX_NR = 100;
+constexpr double TOL_NR_RES = 1e-10 * 1e-10;
+constexpr double TOL_NR_POS = 1e-6 * 1e-6;
 
-static constexpr int ITERMAX_NR = 100;
-static constexpr double TOL_NR_RES = 1e-10 * 1e-10;
-static constexpr double TOL_NR_POS = 1e-6 * 1e-6;
+constexpr int ITERMAX_LS = 10;
+constexpr double PARAMETER_LS = 1e-4;
+constexpr double CUTBACK_LS = 0.5;
 
-static constexpr int ITERMAX_LS = 10;
-static constexpr double PARAMETER_LS = 1e-4;
-static constexpr double CUTBACK_LS = 0.5;
+constexpr double TOL_OVERLAP = 1e-8;
+constexpr unsigned int ITERMAX_OVERLAP = 20;
+constexpr double MINSLOPE_OVERLAP = 1e-12;
 
-static constexpr double TOL_OVERLAP = 1e-8;
-static constexpr unsigned int ITERMAX_OVERLAP = 20;
-static constexpr double MINSLOPE_OVERLAP = 1e-12;
+constexpr double REGULARIZATION_EPSILON = 1e-12;
+constexpr double MAX_B_FAST = 1e30;
+}    // namespace
 
-static constexpr double REGULARIZATION_EPSILON = 1e-12;
-static constexpr double MAX_B_FAST = 1e30;
+// clang-format off
 
+namespace MathExtraSuperellipsoids {
 
 /* ----------------------------------------------------------------------
    curvature of superellipsoid
@@ -54,10 +55,10 @@ double mean_curvature_superellipsoid(const double *shape, const double *block, c
   // this code computes the mean curvature on the superellipsoid surface
   // for the given global point
   double hess[3][3], grad[3], normal[3];
-  double shapefunc, xlocal[3], tmp_v[3];
+  double xlocal[3], tmp_v[3];
   MathExtra::sub3(surf_global_point, xc, tmp_v); // here tmp_v is the vector from center to surface point
   MathExtra::transpose_matvec(R, tmp_v, xlocal);
-  shapefunc = shape_and_derivatives_local(xlocal, shape, block, flag, grad, hess); // computation of curvature is independent of local or global frame
+  (void) shape_and_derivatives_local(xlocal, shape, block, flag, grad, hess); // computation of curvature is independent of local or global frame
   MathExtra::normalize3(grad, normal);
   MathExtra::matvec(hess, normal, tmp_v); // here tmp_v is intermediate product
   double F_mag = sqrt(MathExtra::dot3(grad, grad));
@@ -70,10 +71,10 @@ double gaussian_curvature_superellipsoid(const double *shape, const double *bloc
   // this code computes the gaussian curvature coefficient
   // for the given global point
   double hess[3][3], grad[3], normal[3];
-  double shapefunc, xlocal[3], tmp_v[3];
+  double xlocal[3], tmp_v[3];
   MathExtra::sub3(surf_global_point, xc, tmp_v); // here tmp_v is the vector from center to surface point
   MathExtra::transpose_matvec(R, tmp_v, xlocal);
-  shapefunc = shape_and_derivatives_local(xlocal, shape, block, flag, grad, hess); // computation of curvature is independent of local or global frame
+  (void) shape_and_derivatives_local(xlocal, shape, block, flag, grad, hess); // computation of curvature is independent of local or global frame
   MathExtra::normalize3(grad, normal);
 
   double temp[3];
@@ -126,12 +127,12 @@ void global2local_vector(const double *v, const double *quat, double *local_v)
 void apply_regularization_shape_function(double n1, const double avg_radius, double *value, double *grad, double hess[3][3])
 {
   // value is F - 1
-  double base = std::fmax(*value + 1.0, REGULARIZATION_EPSILON);
+  double base = fmax(*value + 1.0, REGULARIZATION_EPSILON);
   const double inv_F = 1.0 / base;
   const double inv_n1 = 1.0 / n1;
 
   // P = base^(1/n)
-  const double F_pow_inv_n1 = std::pow(base, inv_n1);
+  const double F_pow_inv_n1 = pow(base, inv_n1);
 
   // Scale for Gradient: S1 = R * (1/n) * base^(1/n - 1)
   const double scale_grad = avg_radius * inv_n1 * F_pow_inv_n1 * inv_F;
@@ -162,7 +163,7 @@ void apply_regularization_shape_function(double n1, const double avg_radius, dou
 
 double shape_and_derivatives_local(const double* xlocal, const double* shape, const double* block, const int flag, double* grad, double hess[3][3])
 {
-  double shapefunc;
+  double shapefunc = 0.0;
   // TODO: Not sure how to make flag values more clear
   // Cannot forward declare the enum AtomVecEllipsoid::BlockType
   // Could use scoped (enum class) but no implicit conversion:
@@ -199,21 +200,21 @@ double shape_and_derivatives_local_superquad(const double* xlocal, const double*
   double a_inv = 1.0 / shape[0];
   double b_inv = 1.0 / shape[1];
   double c_inv = 1.0 / shape[2];
-  double x_a = std::fabs(xlocal[0] * a_inv);
-  double y_b = std::fabs(xlocal[1] * b_inv);
-  double z_c = std::fabs(xlocal[2] * c_inv);
+  double x_a = fabs(xlocal[0] * a_inv);
+  double y_b = fabs(xlocal[1] * b_inv);
+  double z_c = fabs(xlocal[2] * c_inv);
   double n1 = block[0];
   double n2 = block[1];
-  double x_a_pow_n2_m2 = std::pow(x_a, n2 - 2.0);
+  double x_a_pow_n2_m2 = pow(x_a, n2 - 2.0);
   double x_a_pow_n2_m1 = x_a_pow_n2_m2 * x_a;
-  double y_b_pow_n2_m2 = std::pow(y_b, n2 - 2.0);
+  double y_b_pow_n2_m2 = pow(y_b, n2 - 2.0);
   double y_b_pow_n2_m1 = y_b_pow_n2_m2 * y_b;
 
   double nu = (x_a_pow_n2_m1 * x_a) + (y_b_pow_n2_m1 * y_b);
-  double nu_pow_n1_n2_m2 = std::pow(nu, n1/n2 - 2.0);
+  double nu_pow_n1_n2_m2 = pow(nu, n1/n2 - 2.0);
   double nu_pow_n1_n2_m1 = nu_pow_n1_n2_m2 * nu;
 
-  double z_c_pow_n1_m2 = std::pow(z_c, n1 -2.0);
+  double z_c_pow_n1_m2 = pow(z_c, n1 -2.0);
   double z_c_pow_n1_m1 = z_c_pow_n1_m2 * z_c;
 
   // Equation (14)
@@ -246,14 +247,14 @@ double shape_and_derivatives_local_n1equaln2(const double* xlocal, const double*
   double a_inv = 1.0 / shape[0];
   double b_inv = 1.0 / shape[1];
   double c_inv = 1.0 / shape[2];
-  double x_a = std::fabs(xlocal[0] * a_inv);
-  double y_b = std::fabs(xlocal[1] * b_inv);
-  double z_c = std::fabs(xlocal[2] * c_inv);
-  double x_a_pow_n_m2 = std::pow(x_a, n - 2.0);
+  double x_a = fabs(xlocal[0] * a_inv);
+  double y_b = fabs(xlocal[1] * b_inv);
+  double z_c = fabs(xlocal[2] * c_inv);
+  double x_a_pow_n_m2 = pow(x_a, n - 2.0);
   double x_a_pow_n_m1 = x_a_pow_n_m2 * x_a;
-  double y_b_pow_n_m2 = std::pow(y_b, n - 2.0);
+  double y_b_pow_n_m2 = pow(y_b, n - 2.0);
   double y_b_pow_n_m1 = y_b_pow_n_m2 * y_b;
-  double z_c_pow_n_m2 = std::pow(z_c, n - 2.0);
+  double z_c_pow_n_m2 = pow(z_c, n - 2.0);
   double z_c_pow_n_m1 = z_c_pow_n_m2 * z_c;
 
   // Equation (14)
@@ -265,7 +266,6 @@ double shape_and_derivatives_local_n1equaln2(const double* xlocal, const double*
   grad[2] = n * c_inv * z_c_pow_n_m1 * signz;
 
   // Equation (15)
-  double signxy = signx * signy;
   hess[0][0] = a_inv * a_inv * n * (n - 1.0) * x_a_pow_n_m2;
   hess[1][1] = b_inv * b_inv * n * (n - 1.0) * y_b_pow_n_m2;
   hess[2][2] = c_inv * c_inv * n * (n - 1.0) * z_c_pow_n_m2;
@@ -428,7 +428,7 @@ int determine_contact_point(const double* xci, const double Ri[3][3], const doub
   if (norm < TOL_NR_RES) {
 
     //  must compute the normal vector nij before returning since the Newton loop normally handles this upon convergence.
-    double xilocal[3], tmp_v[3], gradi[3], val_dummy;
+    double xilocal[3], tmp_v[3], gradi[3];
 
     // Transform global X0 to local frame of particle I
     MathExtra::sub3(X0, xci, tmp_v);
@@ -439,9 +439,9 @@ int determine_contact_point(const double* xci, const double Ri[3][3], const doub
     // TODO: might use a simpler function to simply compute the gradient, to
     // avoid computing quantities already computed in compute_residual_and_jacobian
     if (flagi <= 1)
-      val_dummy = shape_and_gradient_local_n1equaln2_surfacesearch(xilocal, shapei, blocki[0], tmp_v);
+      (void) shape_and_gradient_local_n1equaln2_surfacesearch(xilocal, shapei, blocki[0], tmp_v);
     else
-      val_dummy = shape_and_gradient_local_superquad_surfacesearch(xilocal, shapei, blocki, tmp_v);
+      (void) shape_and_gradient_local_superquad_surfacesearch(xilocal, shapei, blocki, tmp_v);
 
     // Rotate gradient back to global frame to get normal
     MathExtra::matvec(Ri, tmp_v, gradi);
@@ -475,11 +475,11 @@ int determine_contact_point(const double* xci, const double Ri[3][3], const doub
 
     // check for divergence or numerical issues in the fast solver
     // and fall back to regularized solver if necessary
-    bool fail0 = !std::isfinite(b_fast[0]) | (std::abs(b_fast[0]) > MAX_B_FAST);
-    bool fail1 = !std::isfinite(b_fast[1]) | (std::abs(b_fast[1]) > MAX_B_FAST);
-    bool fail2 = !std::isfinite(b_fast[2]) | (std::abs(b_fast[2]) > MAX_B_FAST);
-    bool fail3 = !std::isfinite(b_fast[3]) | (std::abs(b_fast[3]) > MAX_B_FAST);
-    if (fail0 | fail1 | fail2 | fail3) {
+    bool fail0 = !std::isfinite(b_fast[0]) || (fabs(b_fast[0]) > MAX_B_FAST);
+    bool fail1 = !std::isfinite(b_fast[1]) || (fabs(b_fast[1]) > MAX_B_FAST);
+    bool fail2 = !std::isfinite(b_fast[2]) || (fabs(b_fast[2]) > MAX_B_FAST);
+    bool fail3 = !std::isfinite(b_fast[3]) || (fabs(b_fast[3]) > MAX_B_FAST);
+    if (fail0 || fail1 || fail2 || fail3) {
         gauss_elim_solved = false;
     }
 
@@ -496,7 +496,7 @@ int determine_contact_point(const double* xci, const double Ri[3][3], const doub
       b_fast[2] = -residual[2]; b_fast[3] = -residual[3];
        // enforce a minimum regularization to avoid zero pivots in edge cases (flat on flat)
       double trace = jacobian[0] + jacobian[5] + jacobian[10];
-      double diag_weight = std::fmax(TIKHONOV_SCALE * trace, TIKHONOV_SCALE);
+      double diag_weight = fmax(TIKHONOV_SCALE * trace, TIKHONOV_SCALE);
       A_fast[0]  += diag_weight;
       A_fast[5]  += diag_weight;
       A_fast[10] += diag_weight;
@@ -517,7 +517,7 @@ int determine_contact_point(const double* xci, const double Ri[3][3], const doub
 
     // Limit the max step size to avoid jumping too far
     // normalize residual vector if step was limited
-    double spatial_residual_norm = std::sqrt(rhs[0]*rhs[0] + rhs[1]*rhs[1] + rhs[2]*rhs[2]);
+    double spatial_residual_norm = sqrt(rhs[0]*rhs[0] + rhs[1]*rhs[1] + rhs[2]*rhs[2]);
 
     if (spatial_residual_norm > max_step) {
         double scale = max_step / spatial_residual_norm;
@@ -658,21 +658,21 @@ double shape_and_gradient_local_superquad_surfacesearch(const double* xlocal, co
   double a_inv = 1.0 / shape[0];
   double b_inv = 1.0 / shape[1];
   double c_inv = 1.0 / shape[2];
-  double x_a = std::fabs(xlocal[0] * a_inv);
-  double y_b = std::fabs(xlocal[1] * b_inv);
-  double z_c = std::fabs(xlocal[2] * c_inv);
+  double x_a = fabs(xlocal[0] * a_inv);
+  double y_b = fabs(xlocal[1] * b_inv);
+  double z_c = fabs(xlocal[2] * c_inv);
   double n1 = block[0];
   double n2 = block[1];
-  double x_a_pow_n2_m2 = std::pow(x_a, n2 - 2.0);
+  double x_a_pow_n2_m2 = pow(x_a, n2 - 2.0);
   double x_a_pow_n2_m1 = x_a_pow_n2_m2 * x_a;
-  double y_b_pow_n2_m2 = std::pow(y_b, n2 - 2.0);
+  double y_b_pow_n2_m2 = pow(y_b, n2 - 2.0);
   double y_b_pow_n2_m1 = y_b_pow_n2_m2 * y_b;
 
   double nu = (x_a_pow_n2_m1 * x_a) + (y_b_pow_n2_m1 * y_b);
-  double nu_pow_n1_n2_m2 = std::pow(nu, n1/n2 - 2.0);
+  double nu_pow_n1_n2_m2 = pow(nu, n1/n2 - 2.0);
   double nu_pow_n1_n2_m1 = nu_pow_n1_n2_m2 * nu;
 
-  double z_c_pow_n1_m2 = std::pow(z_c, n1 -2.0);
+  double z_c_pow_n1_m2 = pow(z_c, n1 -2.0);
   double z_c_pow_n1_m1 = z_c_pow_n1_m2 * z_c;
 
   // Equation (14)
@@ -685,13 +685,13 @@ double shape_and_gradient_local_superquad_surfacesearch(const double* xlocal, co
 
   double F = (nu_pow_n1_n2_m1 * nu) + (z_c_pow_n1_m1 * z_c);
 
-  double scale_factor = std::pow(F, 1.0/n1 -1.0) / n1;
+  double scale_factor = pow(F, 1.0/n1 -1.0) / n1;
 
   grad[0] *= scale_factor;
   grad[1] *= scale_factor;
   grad[2] *= scale_factor;
 
-  return std::pow(F, 1.0/n1) - 1.0;
+  return pow(F, 1.0/n1) - 1.0;
 }
 
 /* ----------------------------------------------------------------------
@@ -703,14 +703,14 @@ double shape_and_gradient_local_n1equaln2_surfacesearch(const double* xlocal, co
   double a_inv = 1.0 / shape[0];
   double b_inv = 1.0 / shape[1];
   double c_inv = 1.0 / shape[2];
-  double x_a = std::fabs(xlocal[0] * a_inv);
-  double y_b = std::fabs(xlocal[1] * b_inv);
-  double z_c = std::fabs(xlocal[2] * c_inv);
-  double x_a_pow_n_m2 = std::pow(x_a, n - 2.0);
+  double x_a = fabs(xlocal[0] * a_inv);
+  double y_b = fabs(xlocal[1] * b_inv);
+  double z_c = fabs(xlocal[2] * c_inv);
+  double x_a_pow_n_m2 = pow(x_a, n - 2.0);
   double x_a_pow_n_m1 = x_a_pow_n_m2 * x_a;
-  double y_b_pow_n_m2 = std::pow(y_b, n - 2.0);
+  double y_b_pow_n_m2 = pow(y_b, n - 2.0);
   double y_b_pow_n_m1 = y_b_pow_n_m2 * y_b;
-  double z_c_pow_n_m2 = std::pow(z_c, n - 2.0);
+  double z_c_pow_n_m2 = pow(z_c, n - 2.0);
   double z_c_pow_n_m1 = z_c_pow_n_m2 * z_c;
 
   // Equation (14)
@@ -722,13 +722,13 @@ double shape_and_gradient_local_n1equaln2_surfacesearch(const double* xlocal, co
   grad[2] = n * c_inv * z_c_pow_n_m1 * signz;
 
   double F = (x_a_pow_n_m1 * x_a) + (y_b_pow_n_m1 * y_b) + (z_c_pow_n_m1 * z_c);
-  double scale_factor = std::pow(F, 1.0/n -1.0) / n;
+  double scale_factor = pow(F, 1.0/n -1.0) / n;
 
   grad[0] *= scale_factor;
   grad[1] *= scale_factor;
   grad[2] *= scale_factor;
 
-  return std::pow(F, 1.0/n) - 1.0;
+  return pow(F, 1.0/n) - 1.0;
 }
 
 /* ----------------------------------------------------------------------
@@ -747,7 +747,6 @@ double compute_overlap_distance(
   MathExtra::transpose_matvec(Rot, del, local_point);
   MathExtra::transpose_matvec(Rot, global_normal, local_normal);
 
-  double local_f;
   double local_grad[3];
 
   // elliposid analytical solution, might need to double check the math
@@ -780,7 +779,7 @@ double compute_overlap_distance(
 
     // Clamp delta to zero just in case numerical noise makes it negative
     if (delta < 0.0) delta = 0.0;
-    overlap = (-B + std::sqrt(delta)) / (2.0 * A);
+    overlap = (-B + sqrt(delta)) / (2.0 * A);
   } else {
     // --- Superquadric Case (Newton-Raphson on Distance Estimator) ---
 
@@ -801,7 +800,7 @@ double compute_overlap_distance(
       }
 
       // Convergence Check
-      if (std::fabs(val) < TOL_OVERLAP) break;
+      if (fabs(val) < TOL_OVERLAP) break;
 
       // Newton Step
       double slope = local_grad[0] * local_normal[0] +
@@ -809,7 +808,7 @@ double compute_overlap_distance(
                      local_grad[2] * local_normal[2];
 
       // Safety check to prevent divide-by-zero if ray grazes surface
-      if (std::fabs(slope) < MINSLOPE_OVERLAP) break;
+      if (fabs(slope) < MINSLOPE_OVERLAP) break;
 
       overlap -= val / slope;
     }
diff --git a/src/ASPHERE/math_extra_superellipsoids.h b/src/ASPHERE/math_extra_superellipsoids.h
index 375f7c40ccd..755a1200744 100644
--- a/src/ASPHERE/math_extra_superellipsoids.h
+++ b/src/ASPHERE/math_extra_superellipsoids.h
@@ -21,8 +21,8 @@
 #include "math_extra.h"
 
 #include <cmath>
-#include <iostream>
 #include <limits>
+#include <utility>
 
 namespace MathExtraSuperellipsoids {
 
@@ -39,9 +39,9 @@ inline double det4_M44_zero(const double m[4][4]);
 inline bool solve_4x4_robust_unrolled(double A[16], double b[4]);
 
 inline int check_oriented_bounding_boxes(const double *xc1, const double R1[3][3],
-                                          const double *shape1, const double *xc2,
-                                          const double R2[3][3], const double *shape2,
-                                          int cached_axis);
+                                         const double *shape1, const double *xc2,
+                                         const double R2[3][3], const double *shape2,
+                                         int cached_axis);
 
 inline bool check_intersection_axis(const int axis_id, const double C[3][3],
                                     const double AbsC[3][3], const double *center_distance_box1,
@@ -145,20 +145,20 @@ inline bool MathExtraSuperellipsoids::solve_4x4_robust_unrolled(double A[16], do
   // --- COLUMN 0 ---
   // 1. Find Pivot in Col 0
   int p = 0;
-  double max_val = std::abs(A[0]);
+  double max_val = fabs(A[0]);
   double val;
 
-  val = std::abs(A[4]);
+  val = fabs(A[4]);
   if (val > max_val) {
     max_val = val;
     p = 1;
   }
-  val = std::abs(A[8]);
+  val = fabs(A[8]);
   if (val > max_val) {
     max_val = val;
     p = 2;
   }
-  val = std::abs(A[12]);
+  val = fabs(A[12]);
   if (val > max_val) {
     max_val = val;
     p = 3;
@@ -201,14 +201,14 @@ inline bool MathExtraSuperellipsoids::solve_4x4_robust_unrolled(double A[16], do
   // --- COLUMN 1 ---
   // 1. Find Pivot in Col 1 (starting from row 1)
   p = 1;
-  max_val = std::abs(A[5]);
+  max_val = fabs(A[5]);
 
-  val = std::abs(A[9]);
+  val = fabs(A[9]);
   if (val > max_val) {
     max_val = val;
     p = 2;
   }
-  val = std::abs(A[13]);
+  val = fabs(A[13]);
   if (val > max_val) {
     max_val = val;
     p = 3;
@@ -244,9 +244,9 @@ inline bool MathExtraSuperellipsoids::solve_4x4_robust_unrolled(double A[16], do
   // --- COLUMN 2 ---
   // 1. Find Pivot in Col 2 (starting from row 2)
   p = 2;
-  max_val = std::abs(A[10]);
+  max_val = fabs(A[10]);
 
-  val = std::abs(A[14]);
+  val = fabs(A[14]);
   if (val > max_val) {
     max_val = val;
     p = 3;
@@ -273,7 +273,7 @@ inline bool MathExtraSuperellipsoids::solve_4x4_robust_unrolled(double A[16], do
 
   // --- BACKWARD SUBSTITUTION ---
   // Check last pivot
-  if (std::abs(A[15]) <= 0.0) return false;
+  if (fabs(A[15]) <= 0.0) return false;
 
   double inv3 = 1.0 / A[15];
   b[3] *= inv3;
@@ -311,7 +311,7 @@ inline int MathExtraSuperellipsoids::check_oriented_bounding_boxes(
   MathExtra::transpose_times3(R1, R2, C);    // C = R1^T * R2
   for (unsigned int i = 0; i < 3; i++) {
     for (unsigned int j = 0; j < 3; j++) {
-      AbsC[i][j] = std::fabs(C[i][j]);    // for when absolute values are needed
+      AbsC[i][j] = fabs(C[i][j]);    // for when absolute values are needed
     }
   }
 
@@ -333,8 +333,7 @@ inline int MathExtraSuperellipsoids::check_oriented_bounding_boxes(
     if (axis_id == axis) continue;    // already checked
     separated = check_intersection_axis(axis_id, C, AbsC, center_distance_box1,
                                         center_distance_box2, shape1, shape2);
-    if (separated)
-      return axis_id; // update cached axis
+    if (separated) return axis_id;    // update cached axis
   }
   return -1;    // no separation found
 }
@@ -358,78 +357,80 @@ inline bool MathExtraSuperellipsoids::check_intersection_axis(const int axis_id,
     case 0:    // A0
       R1 = a[0];
       R2 = b[0] * AbsC[0][0] + b[1] * AbsC[0][1] + b[2] * AbsC[0][2];
-      R = std::fabs(center_distance_box1[0]);
+      R = fabs(center_distance_box1[0]);
       break;
     case 1:    // A1
       R1 = a[1];
       R2 = b[0] * AbsC[1][0] + b[1] * AbsC[1][1] + b[2] * AbsC[1][2];
-      R = std::fabs(center_distance_box1[1]);
+      R = fabs(center_distance_box1[1]);
       break;
     case 2:    // A2
       R1 = a[2];
       R2 = b[0] * AbsC[2][0] + b[1] * AbsC[2][1] + b[2] * AbsC[2][2];
-      R = std::fabs(center_distance_box1[2]);
+      R = fabs(center_distance_box1[2]);
       break;
     case 3:    // B0
       R1 = a[0] * AbsC[0][0] + a[1] * AbsC[1][0] + a[2] * AbsC[2][0];
       R2 = b[0];
-      R = std::fabs(center_distance_box2[0]);
+      R = fabs(center_distance_box2[0]);
       break;
     case 4:    // B1
       R1 = a[0] * AbsC[0][1] + a[1] * AbsC[1][1] + a[2] * AbsC[2][1];
       R2 = b[1];
-      R = std::fabs(center_distance_box2[1]);
+      R = fabs(center_distance_box2[1]);
       break;
     case 5:    // B2
       R1 = a[0] * AbsC[0][2] + a[1] * AbsC[1][2] + a[2] * AbsC[2][2];
       R2 = b[2];
-      R = std::fabs(center_distance_box2[2]);
+      R = fabs(center_distance_box2[2]);
       break;
     case 6:    // A0 x B0
       R1 = a[1] * AbsC[2][0] + a[2] * AbsC[1][0];
       R2 = b[1] * AbsC[0][2] + b[2] * AbsC[0][1];
-      R = std::fabs(center_distance_box1[2] * C[1][0] - center_distance_box1[1] * C[2][0]);
+      R = fabs(center_distance_box1[2] * C[1][0] - center_distance_box1[1] * C[2][0]);
       break;
     case 7:    // A0 x B1
       R1 = a[1] * AbsC[2][1] + a[2] * AbsC[1][1];
       R2 = b[0] * AbsC[0][2] + b[2] * AbsC[0][0];
-      R = std::fabs(center_distance_box1[2] * C[1][1] - center_distance_box1[1] * C[2][1]);
+      R = fabs(center_distance_box1[2] * C[1][1] - center_distance_box1[1] * C[2][1]);
       break;
     case 8:    // A0 x B2
       R1 = a[1] * AbsC[2][2] + a[2] * AbsC[1][2];
       R2 = b[0] * AbsC[0][1] + b[1] * AbsC[0][0];
-      R = std::fabs(center_distance_box1[2] * C[1][2] - center_distance_box1[1] * C[2][2]);
+      R = fabs(center_distance_box1[2] * C[1][2] - center_distance_box1[1] * C[2][2]);
       break;
     case 9:    // A1 x B0
       R1 = a[0] * AbsC[2][0] + a[2] * AbsC[0][0];
       R2 = b[1] * AbsC[1][2] + b[2] * AbsC[1][1];
-      R = std::fabs(center_distance_box1[0] * C[2][0] - center_distance_box1[2] * C[0][0]);
+      R = fabs(center_distance_box1[0] * C[2][0] - center_distance_box1[2] * C[0][0]);
       break;
     case 10:    // A1 x B1
       R1 = a[0] * AbsC[2][1] + a[2] * AbsC[0][1];
       R2 = b[0] * AbsC[1][2] + b[2] * AbsC[1][0];
-      R = std::fabs(center_distance_box1[0] * C[2][1] - center_distance_box1[2] * C[0][1]);
+      R = fabs(center_distance_box1[0] * C[2][1] - center_distance_box1[2] * C[0][1]);
       break;
     case 11:    // A1 x B2
       R1 = a[0] * AbsC[2][2] + a[2] * AbsC[0][2];
       R2 = b[0] * AbsC[1][1] + b[1] * AbsC[1][0];
-      R = std::fabs(center_distance_box1[0] * C[2][2] - center_distance_box1[2] * C[0][2]);
+      R = fabs(center_distance_box1[0] * C[2][2] - center_distance_box1[2] * C[0][2]);
       break;
     case 12:    // A2 x B0
       R1 = a[0] * AbsC[1][0] + a[1] * AbsC[0][0];
       R2 = b[1] * AbsC[2][2] + b[2] * AbsC[2][1];
-      R = std::fabs(center_distance_box1[1] * C[0][0] - center_distance_box1[0] * C[1][0]);
+      R = fabs(center_distance_box1[1] * C[0][0] - center_distance_box1[0] * C[1][0]);
       break;
     case 13:    // A2 x B1
       R1 = a[0] * AbsC[1][1] + a[1] * AbsC[0][1];
       R2 = b[0] * AbsC[2][2] + b[2] * AbsC[2][0];
-      R = std::fabs(center_distance_box1[1] * C[0][1] - center_distance_box1[0] * C[1][1]);
+      R = fabs(center_distance_box1[1] * C[0][1] - center_distance_box1[0] * C[1][1]);
       break;
     case 14:    // A2 x B2
       R1 = a[0] * AbsC[1][2] + a[1] * AbsC[0][2];
       R2 = b[0] * AbsC[2][1] + b[1] * AbsC[2][0];
-      R = std::fabs(center_distance_box1[1] * C[0][2] - center_distance_box1[0] * C[1][2]);
+      R = fabs(center_distance_box1[1] * C[0][2] - center_distance_box1[0] * C[1][2]);
       break;
+    default:    // should not happen
+      return false;
   }
 
   if (R > R1 + R2) {
@@ -456,7 +457,7 @@ inline bool MathExtraSuperellipsoids::check_intersection_axis_and_get_seed(
   for (unsigned int i = 0; i < 3; i++) {
     for (unsigned int j = 0; j < 3; j++) {
       // Add epsilon to prevent division by zero in edge cases
-      AbsC[i][j] = std::fabs(C[i][j]) + eps;
+      AbsC[i][j] = fabs(C[i][j]) + eps;
     }
   }
 
@@ -483,77 +484,77 @@ inline bool MathExtraSuperellipsoids::check_intersection_axis_and_get_seed(
       case 0:    // A0
         R1_rad = shape1[0];
         R2_rad = shape2[0] * AbsC[0][0] + shape2[1] * AbsC[0][1] + shape2[2] * AbsC[0][2];
-        dist = std::fabs(center_distance_box1[0]);
+        dist = fabs(center_distance_box1[0]);
         break;
       case 1:    // A1
         R1_rad = shape1[1];
         R2_rad = shape2[0] * AbsC[1][0] + shape2[1] * AbsC[1][1] + shape2[2] * AbsC[1][2];
-        dist = std::fabs(center_distance_box1[1]);
+        dist = fabs(center_distance_box1[1]);
         break;
       case 2:    // A2
         R1_rad = shape1[2];
         R2_rad = shape2[0] * AbsC[2][0] + shape2[1] * AbsC[2][1] + shape2[2] * AbsC[2][2];
-        dist = std::fabs(center_distance_box1[2]);
+        dist = fabs(center_distance_box1[2]);
         break;
       case 3:    // B0
         R1_rad = shape1[0] * AbsC[0][0] + shape1[1] * AbsC[1][0] + shape1[2] * AbsC[2][0];
         R2_rad = shape2[0];
-        dist = std::fabs(center_distance_box2[0]);
+        dist = fabs(center_distance_box2[0]);
         break;
       case 4:    // B1
         R1_rad = shape1[0] * AbsC[0][1] + shape1[1] * AbsC[1][1] + shape1[2] * AbsC[2][1];
         R2_rad = shape2[1];
-        dist = std::fabs(center_distance_box2[1]);
+        dist = fabs(center_distance_box2[1]);
         break;
       case 5:    // B2
         R1_rad = shape1[0] * AbsC[0][2] + shape1[1] * AbsC[1][2] + shape1[2] * AbsC[2][2];
         R2_rad = shape2[2];
-        dist = std::fabs(center_distance_box2[2]);
+        dist = fabs(center_distance_box2[2]);
         break;
       case 6:    // A0 x B0
         R1_rad = shape1[1] * AbsC[2][0] + shape1[2] * AbsC[1][0];
         R2_rad = shape2[1] * AbsC[0][2] + shape2[2] * AbsC[0][1];
-        dist = std::fabs(center_distance_box1[2] * C[1][0] - center_distance_box1[1] * C[2][0]);
+        dist = fabs(center_distance_box1[2] * C[1][0] - center_distance_box1[1] * C[2][0]);
         break;
       case 7:    // A0 x B1
         R1_rad = shape1[1] * AbsC[2][1] + shape1[2] * AbsC[1][1];
         R2_rad = shape2[0] * AbsC[0][2] + shape2[2] * AbsC[0][0];
-        dist = std::fabs(center_distance_box1[2] * C[1][1] - center_distance_box1[1] * C[2][1]);
+        dist = fabs(center_distance_box1[2] * C[1][1] - center_distance_box1[1] * C[2][1]);
         break;
       case 8:    // A0 x B2
         R1_rad = shape1[1] * AbsC[2][2] + shape1[2] * AbsC[1][2];
         R2_rad = shape2[0] * AbsC[0][1] + shape2[1] * AbsC[0][0];
-        dist = std::fabs(center_distance_box1[2] * C[1][2] - center_distance_box1[1] * C[2][2]);
+        dist = fabs(center_distance_box1[2] * C[1][2] - center_distance_box1[1] * C[2][2]);
         break;
       case 9:    // A1 x B0
         R1_rad = shape1[0] * AbsC[2][0] + shape1[2] * AbsC[0][0];
         R2_rad = shape2[1] * AbsC[1][2] + shape2[2] * AbsC[1][1];
-        dist = std::fabs(center_distance_box1[0] * C[2][0] - center_distance_box1[2] * C[0][0]);
+        dist = fabs(center_distance_box1[0] * C[2][0] - center_distance_box1[2] * C[0][0]);
         break;
       case 10:    // A1 x B1
         R1_rad = shape1[0] * AbsC[2][1] + shape1[2] * AbsC[0][1];
         R2_rad = shape2[0] * AbsC[1][2] + shape2[2] * AbsC[1][0];
-        dist = std::fabs(center_distance_box1[0] * C[2][1] - center_distance_box1[2] * C[0][1]);
+        dist = fabs(center_distance_box1[0] * C[2][1] - center_distance_box1[2] * C[0][1]);
         break;
       case 11:    // A1 x B2
         R1_rad = shape1[0] * AbsC[2][2] + shape1[2] * AbsC[0][2];
         R2_rad = shape2[0] * AbsC[1][1] + shape2[1] * AbsC[1][0];
-        dist = std::fabs(center_distance_box1[0] * C[2][2] - center_distance_box1[2] * C[0][2]);
+        dist = fabs(center_distance_box1[0] * C[2][2] - center_distance_box1[2] * C[0][2]);
         break;
       case 12:    // A2 x B0
         R1_rad = shape1[0] * AbsC[1][0] + shape1[1] * AbsC[0][0];
         R2_rad = shape2[1] * AbsC[2][2] + shape2[2] * AbsC[2][1];
-        dist = std::fabs(center_distance_box1[1] * C[0][0] - center_distance_box1[0] * C[1][0]);
+        dist = fabs(center_distance_box1[1] * C[0][0] - center_distance_box1[0] * C[1][0]);
         break;
       case 13:    // A2 x B1
         R1_rad = shape1[0] * AbsC[1][1] + shape1[1] * AbsC[0][1];
         R2_rad = shape2[0] * AbsC[2][2] + shape2[2] * AbsC[2][0];
-        dist = std::fabs(center_distance_box1[1] * C[0][1] - center_distance_box1[0] * C[1][1]);
+        dist = fabs(center_distance_box1[1] * C[0][1] - center_distance_box1[0] * C[1][1]);
         break;
       case 14:    // A2 x B2
         R1_rad = shape1[0] * AbsC[1][2] + shape1[1] * AbsC[0][2];
         R2_rad = shape2[0] * AbsC[2][1] + shape2[1] * AbsC[2][0];
-        dist = std::fabs(center_distance_box1[1] * C[0][2] - center_distance_box1[0] * C[1][2]);
+        dist = fabs(center_distance_box1[1] * C[0][2] - center_distance_box1[0] * C[1][2]);
         break;
       default:
         return false;
@@ -592,9 +593,7 @@ inline bool MathExtraSuperellipsoids::check_intersection_axis_and_get_seed(
     // Face-to-Face contact logic: Project "Incident" box onto "Reference" face, clip to find overlap center.
     // Pointers to define who is Reference (the face) and who is Incident
     const double *posRef = xc1;
-    const double *posInc = xc2;
-    const double(*RRef)[3] = R1;
-    const double(*RInc)[3] = R2;
+    const double (*RRef)[3] = R1;
     const double *shapeRef = shape1;
     const double *shapeInc = shape2;
     double *D_local_Ref = center_distance_box1;    // Center dist in Ref frame
@@ -604,9 +603,7 @@ inline bool MathExtraSuperellipsoids::check_intersection_axis_and_get_seed(
     // Swap if Reference is Box 2 (Indices 3, 4, 5)
     if (best_axis >= 3) {
       posRef = xc2;
-      posInc = xc1;
       RRef = R2;
-      RInc = R1;
       shapeRef = shape2;
       shapeInc = shape1;
       D_local_Ref = center_distance_box2;
@@ -754,9 +751,9 @@ inline int MathExtraSuperellipsoids::determine_contact_point_wall(
     X0_local[2] = c * c * nz * inv_norm;
   } else {
     // General Superellipsoid
-    double nx_abs = std::fabs(nx);
-    double ny_abs = std::fabs(ny);
-    double nz_abs = std::fabs(nz);
+    double nx_abs = fabs(nx);
+    double ny_abs = fabs(ny);
+    double nz_abs = fabs(nz);
     double n1 = blocki[0];
     double n2 = blocki[1];
 
@@ -771,22 +768,20 @@ inline int MathExtraSuperellipsoids::determine_contact_point_wall(
       double p1 = 1.0 / (n1 - 1.0);
 
       if (nx_abs > ny_abs) {
-        double alpha = std::pow((b * ny_abs) / (a * nx_abs), p2);
-        double gamma = std::pow(1.0 + std::pow(alpha, n2), n1 / n2 - 1.0);
-        double beta = std::pow((c * nz_abs) / (a * nx_abs) * gamma, p1);
+        double alpha = pow((b * ny_abs) / (a * nx_abs), p2);
+        double gamma = pow(1.0 + pow(alpha, n2), n1 / n2 - 1.0);
+        double beta = pow((c * nz_abs) / (a * nx_abs) * gamma, p1);
 
-        double den =
-            std::pow(std::pow(1.0 + std::pow(alpha, n2), n1 / n2) + std::pow(beta, n1), 1.0 / n1);
+        double den = pow(pow(1.0 + pow(alpha, n2), n1 / n2) + pow(beta, n1), 1.0 / n1);
         x = 1.0 / den;
         y = alpha * x;
         z = beta * x;
       } else {
-        double alpha = std::pow((a * nx_abs) / (b * ny_abs), p2);
-        double gamma = std::pow(1.0 + std::pow(alpha, n2), n1 / n2 - 1.0);
-        double beta = std::pow((c * nz_abs) / (b * ny_abs) * gamma, p1);
+        double alpha = pow((a * nx_abs) / (b * ny_abs), p2);
+        double gamma = pow(1.0 + pow(alpha, n2), n1 / n2 - 1.0);
+        double beta = pow((c * nz_abs) / (b * ny_abs) * gamma, p1);
 
-        double den =
-            std::pow(std::pow(1.0 + std::pow(alpha, n2), n1 / n2) + std::pow(beta, n1), 1.0 / n1);
+        double den = pow(pow(1.0 + pow(alpha, n2), n1 / n2) + pow(beta, n1), 1.0 / n1);
         y = 1.0 / den;
         x = alpha * y;
         z = beta * y;
diff --git a/src/ASPHERE/pair_granular_superellipsoid.cpp b/src/ASPHERE/pair_granular_superellipsoid.cpp
index 15b2841e159..91413172d9b 100644
--- a/src/ASPHERE/pair_granular_superellipsoid.cpp
+++ b/src/ASPHERE/pair_granular_superellipsoid.cpp
@@ -50,7 +50,14 @@ static constexpr double MIN_CURVATURE = 1e-12;
 
 /* ---------------------------------------------------------------------- */
 
-PairGranularSuperellipsoid::PairGranularSuperellipsoid(LAMMPS *lmp) : Pair(lmp)
+PairGranularSuperellipsoid::PairGranularSuperellipsoid(LAMMPS *lmp) :
+    Pair(lmp), onerad_dynamic(nullptr), onerad_frozen(nullptr), maxrad_dynamic(nullptr),
+    maxrad_frozen(nullptr), fix_dummy(nullptr), fix_history(nullptr), fix_rigid(nullptr),
+    mass_rigid(nullptr), normal_model(nullptr), damping_model(nullptr), tangential_model(nullptr),
+    limit_damping(nullptr), kn(nullptr), gamman(nullptr), kt(nullptr), xt(nullptr), xmu(nullptr),
+    xi(nullptr), xj(nullptr), vi(nullptr), vj(nullptr), quati(nullptr), quatj(nullptr),
+    angmomi(nullptr), angmomj(nullptr), inertiai(nullptr), inertiaj(nullptr), history_data(nullptr),
+    xref(nullptr), cutoff_type(nullptr)
 {
   single_enable = 1;
   no_virial_fdotr_compute = 1;
@@ -191,10 +198,9 @@ void PairGranularSuperellipsoid::compute(int eflag, int vflag)
 
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
-  int newton_pair = force->newton_pair;
   double *special_lj = force->special_lj;
 
-  auto avec_ellipsoid = dynamic_cast<AtomVecEllipsoid *>(atom->style_match("ellipsoid"));
+  auto *avec_ellipsoid = dynamic_cast<AtomVecEllipsoid *>(atom->style_match("ellipsoid"));
   AtomVecEllipsoid::BonusSuper *bonus = avec_ellipsoid->bonus_super;
   int *ellipsoid = atom->ellipsoid;
 
@@ -244,12 +250,12 @@ void PairGranularSuperellipsoid::compute(int eflag, int vflag)
 
       MathExtra::copy3(bonus[ellipsoid[i]].shape, shapei0);
       MathExtra::copy3(bonus[ellipsoid[j]].shape, shapej0);
-      MathExtra::copy3(bonus[ellipsoid[i]].block, blocki0);
-      MathExtra::copy3(bonus[ellipsoid[j]].block, blockj0);
+      MathExtra::copy2(bonus[ellipsoid[i]].block, blocki0);
+      MathExtra::copy2(bonus[ellipsoid[j]].block, blockj0);
       MathExtra::copy3(bonus[ellipsoid[i]].shape, shapei);
       MathExtra::copy3(bonus[ellipsoid[j]].shape, shapej);
-      MathExtra::copy3(bonus[ellipsoid[i]].block, blocki);
-      MathExtra::copy3(bonus[ellipsoid[j]].block, blockj);
+      MathExtra::copy2(bonus[ellipsoid[i]].block, blocki);
+      MathExtra::copy2(bonus[ellipsoid[j]].block, blockj);
       MathExtra::quat_to_mat(bonus[ellipsoid[i]].quat, Ri);
       MathExtra::quat_to_mat(bonus[ellipsoid[j]].quat, Rj);
 
@@ -425,6 +431,7 @@ void PairGranularSuperellipsoid::coeff(int narg, char **arg)
   }
 
   damping_one = -1;
+  limit_one = 0;
 
   //Parse optional arguments
   while (iarg < narg) {
@@ -788,8 +795,9 @@ void PairGranularSuperellipsoid::reset_dt()
 
 /* ---------------------------------------------------------------------- */
 
-double PairGranularSuperellipsoid::single(int i, int j, int /*itype*/, int /*jtype*/, double rsq,
-                                          double /*factor_coul*/, double factor_lj, double &fforce)
+double PairGranularSuperellipsoid::single(int i, int j, int /*itype*/, int /*jtype*/,
+                                          double /*rsq*/, double /*factor_coul*/, double factor_lj,
+                                          double &fforce)
 {
   if (factor_lj == 0) {
     fforce = 0.0;
@@ -821,8 +829,6 @@ double PairGranularSuperellipsoid::single(int i, int j, int /*itype*/, int /*jty
   xj = atom->x[j];
   radi = atom->radius[i];
   radj = atom->radius[j];
-  itype = itype;
-  jtype = jtype;
   history_data = &allhistory[size_history * neighprev];
   int indx_ref = (atom->tag[i] < atom->tag[j]) ? i : j;
   xref = atom->x[indx_ref];
@@ -830,7 +836,7 @@ double PairGranularSuperellipsoid::single(int i, int j, int /*itype*/, int /*jty
   tagj = atom->tag[j];
   history_update = 0;    // Don't update history
 
-  auto avec_ellipsoid = dynamic_cast<AtomVecEllipsoid *>(atom->style_match("ellipsoid"));
+  auto *avec_ellipsoid = dynamic_cast<AtomVecEllipsoid *>(atom->style_match("ellipsoid"));
   AtomVecEllipsoid::BonusSuper *bonus = avec_ellipsoid->bonus_super;
   int *ellipsoid = atom->ellipsoid;
 
@@ -839,12 +845,12 @@ double PairGranularSuperellipsoid::single(int i, int j, int /*itype*/, int /*jty
 
   MathExtra::copy3(bonus[ellipsoid[i]].shape, shapei0);
   MathExtra::copy3(bonus[ellipsoid[j]].shape, shapej0);
-  MathExtra::copy3(bonus[ellipsoid[i]].block, blocki0);
-  MathExtra::copy3(bonus[ellipsoid[j]].block, blockj0);
+  MathExtra::copy2(bonus[ellipsoid[i]].block, blocki0);
+  MathExtra::copy2(bonus[ellipsoid[j]].block, blockj0);
   MathExtra::copy3(bonus[ellipsoid[i]].shape, shapei);
   MathExtra::copy3(bonus[ellipsoid[j]].shape, shapej);
-  MathExtra::copy3(bonus[ellipsoid[i]].block, blocki);
-  MathExtra::copy3(bonus[ellipsoid[j]].block, blockj);
+  MathExtra::copy2(bonus[ellipsoid[i]].block, blocki);
+  MathExtra::copy2(bonus[ellipsoid[j]].block, blockj);
   MathExtra::quat_to_mat(bonus[ellipsoid[i]].quat, Ri);
   MathExtra::quat_to_mat(bonus[ellipsoid[j]].quat, Rj);
 
@@ -982,7 +988,7 @@ double PairGranularSuperellipsoid::mix_mean(double val1, double val2)
 
 bool PairGranularSuperellipsoid::check_contact()
 {
-  bool touching;
+  bool touching = false;
   if (rsq >= radsum * radsum) {
     touching = false;
   } else {
@@ -1011,8 +1017,6 @@ bool PairGranularSuperellipsoid::check_contact()
       X0[1] = X0_prev[1] + xref[1];
       X0[2] = X0_prev[2] + xref[2];
       X0[3] = X0_prev[3];
-      // std::cout << "Using old contact point as initial guess between particle " << atom->tag[i] << " and particle " << atom->tag[j] << " : "
-      //           << X0[0] << " " << X0[1] << " " << X0[2] << " Lagrange multiplier mu^2: " << X0[3] << std::endl;
       int status = MathExtraSuperellipsoids::determine_contact_point(xi, Ri, shapei, blocki, flagi,
                                                                      xj, Rj, shapej, blockj, flagj,
                                                                      X0, nij, contact_formulation);
diff --git a/src/ASPHERE/pair_granular_superellipsoid.h b/src/ASPHERE/pair_granular_superellipsoid.h
index abd4fa0a468..3ae329c323c 100644
--- a/src/ASPHERE/pair_granular_superellipsoid.h
+++ b/src/ASPHERE/pair_granular_superellipsoid.h
@@ -33,6 +33,7 @@ class PairGranularSuperellipsoid : public Pair {
  public:
   PairGranularSuperellipsoid(class LAMMPS *);
   ~PairGranularSuperellipsoid() override;
+
   void compute(int, int) override;
   void settings(int, char **) override;
   void coeff(int, char **) override;
@@ -42,6 +43,7 @@ class PairGranularSuperellipsoid : public Pair {
   void read_restart(FILE *) override;
   void reset_dt() override;
   double single(int, int, int, int, double, double, double, double &) override;
+
   int pack_forward_comm(int, int *, double *, int, int *) override;
   void unpack_forward_comm(int, int, double *) override;
   double memory_usage() override;
@@ -73,10 +75,10 @@ class PairGranularSuperellipsoid : public Pair {
   int contact_radius_flag;
 
   // Normal coefficients
-  double **kn, **gamman;     // Hooke + Hertz
+  double **kn, **gamman;    // Hooke + Hertz
 
   // Tangential coefficients
-  double **kt, **xt, **xmu;  // linear_history
+  double **kt, **xt, **xmu;    // linear_history
 
   // Intermediate values for contact model
   int history_update, touchjj, itype, jtype;
diff --git a/src/COLVARS/fix_colvars.cpp b/src/COLVARS/fix_colvars.cpp
index 1c9eb966c44..c5b6fea4738 100644
--- a/src/COLVARS/fix_colvars.cpp
+++ b/src/COLVARS/fix_colvars.cpp
@@ -725,7 +725,8 @@ void FixColvars::end_of_step()
     if (comm->me == 0) {
       // store old force data
       std::vector<cvm::rvector> &of = *(proxy->modify_atom_total_forces());
-      for (i=0; i<num_coords; ++i) {
+
+      for (i = 0; i < num_coords; ++i) {
         const tagint k = atom->map(taglist[i]);
         if ((k >= 0) && (k < nlocal)) {
           auto search = idmap.find(tag[k]);
@@ -738,14 +739,15 @@ void FixColvars::end_of_step()
         }
       }
       /* loop over procs to receive remote data */
-      for (i=1; i < comm->nprocs; ++i) {
+      for (i = 1; i < comm->nprocs; ++i) {
         int maxbuf = nmax*size_one;
         MPI_Irecv(comm_buf, maxbuf, MPI_BYTE, i, 0, world, &request);
         MPI_Send(&tmp, 0, MPI_INT, i, 0, world);
         MPI_Wait(&request, &status);
         MPI_Get_count(&status, MPI_BYTE, &ndata);
         ndata /= size_one;
-        for (int k=0; k<ndata; ++k) {
+
+        for (int k = 0; k < ndata; ++k) {
           auto search = idmap.find(comm_buf[k].tag);
           if (search != idmap.end()) {
             const int j = search->second;
@@ -758,8 +760,8 @@ void FixColvars::end_of_step()
     } else { // me != 0
       /* copy total force data into communication buffer */
       nme = 0;
-      for (i=0; i<num_coords; ++i) {
-        const tagint k = atom->map(taglist[i]);
+      for (i = 0; i < num_coords; ++i) {
+        const auto k = atom->map(taglist[i]);
         if ((k >= 0) && (k < nlocal)) {
           comm_buf[nme].tag  = tag[k];
           comm_buf[nme].x    = f[k][0];
diff --git a/src/DIELECTRIC/fix_polarize_bem_gmres.cpp b/src/DIELECTRIC/fix_polarize_bem_gmres.cpp
index 588a8b8eabf..a883a06ea1b 100644
--- a/src/DIELECTRIC/fix_polarize_bem_gmres.cpp
+++ b/src/DIELECTRIC/fix_polarize_bem_gmres.cpp
@@ -350,8 +350,10 @@ void FixPolarizeBEMGMRES::compute_induced_charges()
   double *em = atom->em;
   double *epsilon = atom->epsilon;
   int nlocal = atom->nlocal;
-  int eflag = 1;
-  int vflag = 0;
+
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   // compute the right hand side (vector b) of Eq. (40) according to Eq. (42)
   // keep the scaled real charges intact here to compute efield for the right hand side (b)
diff --git a/src/DIELECTRIC/fix_polarize_bem_icc.cpp b/src/DIELECTRIC/fix_polarize_bem_icc.cpp
index 85459df8638..a6c8f2fdf66 100644
--- a/src/DIELECTRIC/fix_polarize_bem_icc.cpp
+++ b/src/DIELECTRIC/fix_polarize_bem_icc.cpp
@@ -255,10 +255,12 @@ void FixPolarizeBEMICC::compute_induced_charges()
   double *epsilon = atom->epsilon;
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
-  int eflag = 1;
-  int vflag = 0;
   int itr;
 
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
+
   // use Eq. (64) in Barros et al. to initialize the induced charges
   // Note: area[i] is included here to ensure correct charge unit
   //   for direct use in force/efield compute
diff --git a/src/EXTRA-FIX/fix_numdiff.cpp b/src/EXTRA-FIX/fix_numdiff.cpp
index 87bd456181a..fbebe159e61 100644
--- a/src/EXTRA-FIX/fix_numdiff.cpp
+++ b/src/EXTRA-FIX/fix_numdiff.cpp
@@ -296,18 +296,20 @@ double FixNumDiff::update_energy()
 {
   force_clear(atom->f);
 
-  int eflag = 1;
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
-  if (pair_compute_flag) force->pair->compute(eflag, 0);
+  if (pair_compute_flag) force->pair->compute(eflag, vflag);
 
   if (atom->molecular != Atom::ATOMIC) {
-    if (force->bond) force->bond->compute(eflag, 0);
-    if (force->angle) force->angle->compute(eflag, 0);
-    if (force->dihedral) force->dihedral->compute(eflag, 0);
-    if (force->improper) force->improper->compute(eflag, 0);
+    if (force->bond) force->bond->compute(eflag, vflag);
+    if (force->angle) force->angle->compute(eflag, vflag);
+    if (force->dihedral) force->dihedral->compute(eflag, vflag);
+    if (force->improper) force->improper->compute(eflag, vflag);
   }
 
-  if (kspace_compute_flag) force->kspace->compute(eflag, 0);
+  if (kspace_compute_flag) force->kspace->compute(eflag, vflag);
 
   double energy = pe->compute_scalar();
   return energy;
diff --git a/src/EXTRA-FIX/fix_numdiff_virial.cpp b/src/EXTRA-FIX/fix_numdiff_virial.cpp
index 826d1975c08..beff4b616d2 100644
--- a/src/EXTRA-FIX/fix_numdiff_virial.cpp
+++ b/src/EXTRA-FIX/fix_numdiff_virial.cpp
@@ -272,18 +272,20 @@ void FixNumDiffVirial::restore_atoms(int nall, int idir)
 
 double FixNumDiffVirial::update_energy()
 {
-  int eflag = 1;
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
-  if (pair_compute_flag) force->pair->compute(eflag, 0);
+  if (pair_compute_flag) force->pair->compute(eflag, vflag);
 
   if (atom->molecular != Atom::ATOMIC) {
-    if (force->bond) force->bond->compute(eflag, 0);
-    if (force->angle) force->angle->compute(eflag, 0);
-    if (force->dihedral) force->dihedral->compute(eflag, 0);
-    if (force->improper) force->improper->compute(eflag, 0);
+    if (force->bond) force->bond->compute(eflag, vflag);
+    if (force->angle) force->angle->compute(eflag, vflag);
+    if (force->dihedral) force->dihedral->compute(eflag, vflag);
+    if (force->improper) force->improper->compute(eflag, vflag);
   }
 
-  if (kspace_compute_flag) force->kspace->compute(eflag, 0);
+  if (kspace_compute_flag) force->kspace->compute(eflag, vflag);
 
   double energy = pe->compute_scalar();
   return energy;
diff --git a/src/EXTRA-MOLECULE/dihedral_fourier.cpp b/src/EXTRA-MOLECULE/dihedral_fourier.cpp
index f76bae3b016..73b901c68f5 100644
--- a/src/EXTRA-MOLECULE/dihedral_fourier.cpp
+++ b/src/EXTRA-MOLECULE/dihedral_fourier.cpp
@@ -28,6 +28,7 @@
 #include "neighbor.h"
 
 #include <cmath>
+#include <cstring>
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
@@ -53,17 +54,17 @@ DihedralFourier::~DihedralFourier()
     memory->destroy(nterms);
 
     for (int i=1; i<= atom->ndihedraltypes; i++) {
-      delete [] k[i];
-      delete [] multiplicity[i];
-      delete [] shift[i];
-      delete [] cos_shift[i];
-      delete [] sin_shift[i];
+      delete[] k[i];
+      delete[] multiplicity[i];
+      delete[] shift[i];
+      delete[] cos_shift[i];
+      delete[] sin_shift[i];
     }
-    delete [] k;
-    delete [] multiplicity;
-    delete [] shift;
-    delete [] cos_shift;
-    delete [] sin_shift;
+    delete[] k;
+    delete[] multiplicity;
+    delete[] shift;
+    delete[] cos_shift;
+    delete[] sin_shift;
   }
 }
 
@@ -256,11 +257,13 @@ void DihedralFourier::allocate()
   int n = atom->ndihedraltypes;
 
   memory->create(nterms,n+1,"dihedral:nterms");
-  k = new double * [n+1];
-  multiplicity = new int * [n+1];
-  shift = new double * [n+1];
-  cos_shift = new double * [n+1];
-  sin_shift = new double * [n+1];
+  memset(nterms,0,sizeof(int)*(n+1));
+
+  k = new double *[n+1];
+  multiplicity = new int *[n+1];
+  shift = new double *[n+1];
+  cos_shift = new double *[n+1];
+  sin_shift = new double *[n+1];
   for (int i = 1; i <= n; i++) {
     k[i] = shift[i] = cos_shift[i] = sin_shift[i] = nullptr;
     multiplicity[i] = nullptr;
@@ -290,6 +293,7 @@ void DihedralFourier::coeff(int narg, char **arg)
   int multiplicity_one;
   double shift_one;
   int nterms_one = utils::inumeric(FLERR,arg[1],false,lmp);
+  nterms_max = MAX(nterms_max,nterms_one);
 
   if (nterms_one < 1)
     error->all(FLERR,"Incorrect number of terms arg for dihedral coefficients");
@@ -300,7 +304,6 @@ void DihedralFourier::coeff(int narg, char **arg)
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     nterms[i] = nterms_one;
-    nterms_max = MAX(nterms_max,nterms_one);
     delete[] k[i];
     delete[] multiplicity[i];
     delete[] shift[i];
@@ -335,13 +338,16 @@ void DihedralFourier::coeff(int narg, char **arg)
 
 void DihedralFourier::write_restart(FILE *fp)
 {
+  // must store nterms_max in restart file in addition to the nterms array
+  // the KOKKOS version requires it to store the coefficients in a 2d view
+  fwrite(&nterms_max,sizeof(int),1,fp);
   fwrite(&nterms[1],sizeof(int),atom->ndihedraltypes,fp);
+
   for (int i = 1; i <= atom->ndihedraltypes; i++) {
     fwrite(k[i],sizeof(double),nterms[i],fp);
     fwrite(multiplicity[i],sizeof(int),nterms[i],fp);
     fwrite(shift[i],sizeof(double),nterms[i],fp);
   }
-
 }
 
 /* ----------------------------------------------------------------------
@@ -352,18 +358,20 @@ void DihedralFourier::read_restart(FILE *fp)
 {
   allocate();
 
-  if (comm->me == 0)
+  if (comm->me == 0) {
+    utils::sfread(FLERR,&nterms_max,sizeof(int),1,fp,nullptr,error);
     utils::sfread(FLERR,&nterms[1],sizeof(int),atom->ndihedraltypes,fp,nullptr,error);
-
+  }
+  MPI_Bcast(&nterms_max,1,MPI_INT,0,world);
   MPI_Bcast(&nterms[1],atom->ndihedraltypes,MPI_INT,0,world);
 
   // allocate
-  for (int i=1; i<=atom->ndihedraltypes; i++) {
-    k[i] = new double [nterms[i]];
-    multiplicity[i] = new int [nterms[i]];
-    shift[i] = new double [nterms[i]];
-    cos_shift[i] = new double [nterms[i]];
-    sin_shift[i] = new double [nterms[i]];
+  for (int i = 1; i <= atom->ndihedraltypes; i++) {
+    k[i] = new double[nterms[i]];
+    multiplicity[i] = new int[nterms[i]];
+    shift[i] = new double[nterms[i]];
+    cos_shift[i] = new double[nterms[i]];
+    sin_shift[i] = new double[nterms[i]];
   }
 
   if (comm->me == 0) {
diff --git a/src/EXTRA-MOLECULE/dihedral_nharmonic.cpp b/src/EXTRA-MOLECULE/dihedral_nharmonic.cpp
index a88593dd20c..05dc5fa0255 100644
--- a/src/EXTRA-MOLECULE/dihedral_nharmonic.cpp
+++ b/src/EXTRA-MOLECULE/dihedral_nharmonic.cpp
@@ -40,18 +40,21 @@ DihedralNHarmonic::DihedralNHarmonic(LAMMPS *lmp) : Dihedral(lmp)
   writedata = 1;
   a = nullptr;
   born_matrix_enable = 1;
+  nterms_max = 0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 DihedralNHarmonic::~DihedralNHarmonic()
 {
+  if (copymode) return;
+
   if (allocated) {
     memory->destroy(setflag);
     for (int i = 1; i <= atom->ndihedraltypes; i++)
-      delete [] a[i];
-    delete [] a;
-    delete [] nterms;
+      delete[] a[i];
+    delete[] a;
+    delete[] nterms;
   }
 }
 
@@ -259,8 +262,9 @@ void DihedralNHarmonic::coeff(int narg, char **arg)
 {
   if (narg < 3) error->all(FLERR,"Incorrect args for dihedral coefficients" + utils::errorurl(21));
 
-  int n = utils::inumeric(FLERR,arg[1],false,lmp);
-  if (narg != n + 2)
+  int nterms_one = utils::inumeric(FLERR,arg[1],false,lmp);
+  nterms_max = MAX(nterms_max,nterms_one);
+  if (narg != nterms_one + 2)
     error->all(FLERR,"Incorrect args for dihedral coefficients" + utils::errorurl(21));
 
   if (!allocated) allocate();
@@ -271,9 +275,9 @@ void DihedralNHarmonic::coeff(int narg, char **arg)
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     delete[] a[i];
-    a[i] = new double [n];
-    nterms[i] = n;
-    for (int j = 0; j < n; j++) {
+    a[i] = new double [nterms_one];
+    nterms[i] = nterms_one;
+    for (int j = 0; j < nterms_one; j++) {
       a[i][j] = utils::numeric(FLERR,arg[2+j],false,lmp);
       setflag[i] = 1;
     }
@@ -289,7 +293,11 @@ void DihedralNHarmonic::coeff(int narg, char **arg)
 
 void DihedralNHarmonic::write_restart(FILE *fp)
 {
+  // must store nterms_max in restart file in addition to the nterms array
+  // the KOKKOS version requires it to store the coefficients in a 2d view
+  fwrite(&nterms_max,sizeof(int),1,fp);
   fwrite(&nterms[1],sizeof(int),atom->ndihedraltypes,fp);
+
   for (int i = 1; i <= atom->ndihedraltypes; i++)
     fwrite(a[i],sizeof(double),nterms[i],fp);
 }
@@ -302,9 +310,11 @@ void DihedralNHarmonic::read_restart(FILE *fp)
 {
   allocate();
 
-  if (comm->me == 0)
+  if (comm->me == 0) {
+    utils::sfread(FLERR,&nterms_max,sizeof(int),1,fp,nullptr,error);
     utils::sfread(FLERR,&nterms[1],sizeof(int),atom->ndihedraltypes,fp,nullptr,error);
-
+  }
+  MPI_Bcast(&nterms_max,1,MPI_INT,0,world);
   MPI_Bcast(&nterms[1],atom->ndihedraltypes,MPI_INT,0,world);
 
   // allocate
diff --git a/src/EXTRA-MOLECULE/dihedral_nharmonic.h b/src/EXTRA-MOLECULE/dihedral_nharmonic.h
index 738e98f0720..9b53d1e0dd3 100644
--- a/src/EXTRA-MOLECULE/dihedral_nharmonic.h
+++ b/src/EXTRA-MOLECULE/dihedral_nharmonic.h
@@ -38,6 +38,7 @@ class DihedralNHarmonic : public Dihedral {
  protected:
   int *nterms;
   double **a;
+  int nterms_max;
 
   void allocate();
 };
diff --git a/src/EXTRA-PAIR/pair_lj_cut_sphere.cpp b/src/EXTRA-PAIR/pair_lj_cut_sphere.cpp
index f36ec89cb72..aabd1b256ba 100644
--- a/src/EXTRA-PAIR/pair_lj_cut_sphere.cpp
+++ b/src/EXTRA-PAIR/pair_lj_cut_sphere.cpp
@@ -360,7 +360,7 @@ double PairLJCutSphere::single(int i, int j, int itype, int jtype, double rsq,
   sigma6 = powint(sigma, 6);
   r2inv = 1.0 / rsq;
   r6inv = r2inv * r2inv * r2inv;
-  forcelj = r6inv * 24.0 * epsilon[itype][jtype] * (sigma6 * sigma6 * r6inv - sigma6);
+  forcelj = r6inv * 24.0 * epsilon[itype][jtype] * (2.0 * sigma6 * sigma6 * r6inv - sigma6);
   fforce = factor_lj * forcelj * r2inv;
 
   philj = r6inv * 4.0 * epsilon[itype][jtype] * (sigma6 * sigma6 * r6inv - sigma6);
diff --git a/src/FEP/compute_fep.cpp b/src/FEP/compute_fep.cpp
index 5ade0b4e61f..c444e8352a8 100644
--- a/src/FEP/compute_fep.cpp
+++ b/src/FEP/compute_fep.cpp
@@ -268,8 +268,9 @@ void ComputeFEP::compute_vector()
 {
   double pe0, pe1;
 
-  eflag = 1;
-  vflag = 0;
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   invoked_vector = update->ntimestep;
 
diff --git a/src/FEP/compute_fep.h b/src/FEP/compute_fep.h
index ef25cb2ccb2..af56a0eb155 100644
--- a/src/FEP/compute_fep.h
+++ b/src/FEP/compute_fep.h
@@ -41,7 +41,6 @@ class ComputeFEP : public Compute {
   int chgflag;
   int tailflag, volumeflag;
   int fepinitflag;
-  int eflag, vflag;
   double temp_fep;
 
   int nmax;
diff --git a/src/FEP/compute_fep_ta.cpp b/src/FEP/compute_fep_ta.cpp
index 786bf53bfa9..32f397b3261 100644
--- a/src/FEP/compute_fep_ta.cpp
+++ b/src/FEP/compute_fep_ta.cpp
@@ -149,8 +149,9 @@ void ComputeFEPTA::compute_vector()
 {
   double pe0, pe1;
 
-  eflag = 1;
-  vflag = 0;
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   invoked_vector = update->ntimestep;
 
diff --git a/src/FEP/compute_fep_ta.h b/src/FEP/compute_fep_ta.h
index 5b5891660e4..de876d7b628 100644
--- a/src/FEP/compute_fep_ta.h
+++ b/src/FEP/compute_fep_ta.h
@@ -38,7 +38,6 @@ class ComputeFEPTA : public Compute {
  private:
   int tailflag;
   int fepinitflag;
-  int eflag, vflag;
   double temp_fep;
   double scale_factor;
   int tan_axis1, tan_axis2, norm_axis;
diff --git a/src/GRAPHICS/image_objects.cpp b/src/GRAPHICS/image_objects.cpp
index ee012a92fb1..8a5006c1c45 100644
--- a/src/GRAPHICS/image_objects.cpp
+++ b/src/GRAPHICS/image_objects.cpp
@@ -75,6 +75,32 @@ inline double superscale(const double *shape, const double *block, const vec3 &p
   return pow(pow(a, block[0] / block[1]) + b, -1.0 / block[0]);
 }
 
+// compute surface normal for a superellipsoid at a point on the unit sphere.
+// The superellipsoid surface is: ((|x/a|^e + |y/b|^e)^(n/e) + |z/c|^n)^(1/n) = 1
+// where n = block[0], e = block[1], a = shape[0], b = shape[1], c = shape[2].
+// The normal is proportional to the gradient of the implicit function:
+//   nx = u^(n/e - 1) * |x/a|^(e-1) * sign(x) / a
+//   ny = u^(n/e - 1) * |y/b|^(e-1) * sign(y) / b
+//   nz = |z/c|^(n-1) * sign(z) / c
+// where u = |x/a|^e + |y/b|^e.
+
+inline vec3 supernormal(const double *shape, const double *block, const vec3 &pos)
+{
+  const double n = block[0], e = block[1];
+  const double xa = fabs(pos[0] / shape[0]);
+  const double yb = fabs(pos[1] / shape[1]);
+  const double zc = fabs(pos[2] / shape[2]);
+
+  const double u = pow(xa, e) + pow(yb, e);
+  const double ufactor = (u > 0.0) ? pow(u, n / e - 1.0) : 0.0;
+
+  double nx = (xa > 0.0) ? ufactor * pow(xa, e - 1.0) * copysign(1.0, pos[0]) / shape[0] : 0.0;
+  double ny = (yb > 0.0) ? ufactor * pow(yb, e - 1.0) * copysign(1.0, pos[1]) / shape[1] : 0.0;
+  double nz = (zc > 0.0) ? pow(zc, n - 1.0) * copysign(1.0, pos[2]) / shape[2] : 0.0;
+
+  return vec3norm({nx, ny, nz});
+}
+
 // re-orient list of triangles to point along "dir", then scale and translate it.
 std::vector<triangle> transform(const std::vector<triangle> &triangles, const vec3 &dir,
                                 const vec3 &offs, double len, double width)
@@ -492,6 +518,62 @@ EllipsoidObj::EllipsoidObj(int level)
 
   // refine the list of triangles to the desired level
   for (int i = 1; i < level; ++i) refine();
+
+  // Rotate the sphere mesh so that the Cartesian axes point through (or near)
+  // triangle face centers rather than through vertices or edges.  This improves
+  // the visual quality of ellipsoids and superellipsoids by making them appear
+  // less "pointy" along the three principal axes, especially with lower
+  // triangle counts.  The default orientation has the principal axes pass
+  // through corners or edges.  We prefer smooth geometries for simulations
+  // anyway, and the rotation yields a better approximation from the
+  // triangulation with lower refinement levels and thus requires less
+  // computational effort for creating an acceptable representation.  The
+  // rotation is constructed by finding the face centers closest to the +x and
+  // +y axes and building an orthonormal basis from them via Gram-Schmidt. This
+  // yields a rotation around a tilted axis that moves vertices off all three
+  // coordinate axes simultaneously.
+
+  if (!triangles.empty()) {
+
+    // Find the face centers (normalized to unit sphere) closest to +x and +y
+    const vec3 ax = {1.0, 0.0, 0.0};
+    const vec3 ay = {0.0, 1.0, 0.0};
+    double best_dx = -1.0, best_dy = -1.0;
+    vec3 cx = ax, cy = ay;
+
+    for (const auto &tri : triangles) {
+      vec3 c = vec3norm(tri[0] + tri[1] + tri[2]);
+      double dx = vec3dot(c, ax);
+      double dy = vec3dot(c, ay);
+      if (dx > best_dx) { best_dx = dx; cx = c; }
+      if (dy > best_dy) { best_dy = dy; cy = c; }
+    }
+
+    // Build orthonormal frame {e1, e2, e3} from the two face center directions
+    // using Gram-Schmidt orthogonalization.  The resulting rotation matrix has
+    // e1, e2, e3 as its rows so that e1 maps to +x, e2 maps to +y, and
+    // e3 = e1 x e2 maps to +z.
+    vec3 e1 = cx;
+    vec3 e2 = vec3norm(cy - vec3dot(cy, e1) * e1);
+    vec3 e3 = vec3cross(e1, e2);
+
+    // clang-format off
+    double R[3][3] = {{e1[0], e1[1], e1[2]},
+                      {e2[0], e2[1], e2[2]},
+                      {e3[0], e3[1], e3[2]}};
+    // clang-format on
+
+    // Apply rotation to all triangle vertices
+    for (auto &tri : triangles) {
+      for (auto &v : tri) {
+        vec3 rv;
+        rv[0] = R[0][0] * v[0] + R[0][1] * v[1] + R[0][2] * v[2];
+        rv[1] = R[1][0] * v[0] + R[1][1] * v[1] + R[1][2] * v[2];
+        rv[2] = R[2][0] * v[0] + R[2][1] * v[1] + R[2][2] * v[2];
+        v = rv;
+      }
+    }
+  }
 }
 
 // draw method for drawing ellipsoids from a region which has its own transformation function
@@ -594,36 +676,47 @@ void EllipsoidObj::draw(Image *img, int flag, const double *color, const double
   // draw triangles and edges as requested, work on copy of triangle since we modify it
   for (auto tri : triangles) {
 
+    // compute surface normals from unit sphere coordinates before scaling
+    vec3 n1, n2, n3;
     if (dotri) {
-      // compute ellipsoid surface normals from gradient of x^2/a^2 + y^2/b^2 + z^2/c^2
-      const double sa = shape[0] * shape[0], sb = shape[1] * shape[1], sc = shape[2] * shape[2];
-      vec3 n1 = vec3norm({tri[0][0] / sa, tri[0][1] / sb, tri[0][2] / sc});
-      vec3 n2 = vec3norm({tri[1][0] / sa, tri[1][1] / sb, tri[1][2] / sc});
-      vec3 n3 = vec3norm({tri[2][0] / sa, tri[2][1] / sb, tri[2][2] / sc});
-
-      // set shape by shifting each corner to the surface
       if (block) {
-        for (int i = 0; i < 3; ++i) {
-          auto &t = tri[i];
-          t = superscale(shape, block, t) * t;
-        }
+        // compute superellipsoid surface normals from gradient of implicit function
+        n1 = supernormal(shape, block, tri[0]);
+        n2 = supernormal(shape, block, tri[1]);
+        n3 = supernormal(shape, block, tri[2]);
       } else {
-        for (int i = 0; i < 3; ++i) {
-          auto &t = tri[i];
-          t = radscale(shape, t) * t;
-        }
+        // compute ellipsoid surface normals from gradient of x^2/a^2 + y^2/b^2 + z^2/c^2
+        const double sa = shape[0] * shape[0], sb = shape[1] * shape[1], sc = shape[2] * shape[2];
+        n1 = vec3norm({tri[0][0] / sa, tri[0][1] / sb, tri[0][2] / sc});
+        n2 = vec3norm({tri[1][0] / sa, tri[1][1] / sb, tri[1][2] / sc});
+        n3 = vec3norm({tri[2][0] / sa, tri[2][1] / sb, tri[2][2] / sc});
+      }
+    }
+
+    // set shape by shifting each corner to the surface
+    if (block) {
+      for (int i = 0; i < 3; ++i) {
+        auto &t = tri[i];
+        t = superscale(shape, block, t) * t;
+      }
+    } else {
+      for (int i = 0; i < 3; ++i) {
+        auto &t = tri[i];
+        t = radscale(shape, t) * t;
       }
+    }
 
-      // rotate
-      MathExtra::matvec(p, tri[0].data(), e1.data());
-      MathExtra::matvec(p, tri[1].data(), e2.data());
-      MathExtra::matvec(p, tri[2].data(), e3.data());
+    // rotate
+    MathExtra::matvec(p, tri[0].data(), e1.data());
+    MathExtra::matvec(p, tri[1].data(), e2.data());
+    MathExtra::matvec(p, tri[2].data(), e3.data());
 
-      // translate
-      e1 = e1 + offs;
-      e2 = e2 + offs;
-      e3 = e3 + offs;
+    // translate
+    e1 = e1 + offs;
+    e2 = e2 + offs;
+    e3 = e3 + offs;
 
+    if (dotri) {
       // rotate normals (no translation or scaling)
       vec3 rn1, rn2, rn3;
       MathExtra::matvec(p, n1.data(), rn1.data());
@@ -635,28 +728,6 @@ void EllipsoidObj::draw(Image *img, int flag, const double *color, const double
     }
 
     if (doframe) {
-      // set shape
-      if (block) {
-        for (int i = 0; i < 3; ++i) {
-          auto &t = tri[i];
-          t = superscale(shape, block, t) * t;
-        }
-      } else {
-        for (int i = 0; i < 3; ++i) {
-          auto &t = tri[i];
-          t = radscale(shape, t) * t;
-        }
-      }
-
-      // rotate
-      MathExtra::matvec(p, tri[0].data(), e1.data());
-      MathExtra::matvec(p, tri[1].data(), e2.data());
-      MathExtra::matvec(p, tri[2].data(), e3.data());
-
-      // translate
-      e1 = e1 + offs;
-      e2 = e2 + offs;
-      e3 = e3 + offs;
       img->draw_cylinder(e1.data(), e2.data(), color, diameter, 3, opacity);
       img->draw_cylinder(e2.data(), e3.data(), color, diameter, 3, opacity);
       img->draw_cylinder(e3.data(), e1.data(), color, diameter, 3, opacity);
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 424751a7c35..2f554b9abe8 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -87,6 +87,10 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
   max_buf_fix = 0;
   k_buf_send_fix = DAT::tdual_double_1d("comm:k_buf_send_fix",1);
   k_buf_recv_fix = DAT::tdual_double_1d("comm:k_recv_send_fix",1);
+
+  max_buf_compute = 0;    // tracked length of the compute comm buffers; updated by grow_buf_compute()
+  k_buf_send_compute = DAT::tdual_double_1d("comm:k_buf_send_compute",1);
+  k_buf_recv_compute = DAT::tdual_double_1d("comm:k_buf_recv_compute",1);    // label names the receive buffer
 }
 
 /* ---------------------------------------------------------------------- */
@@ -117,6 +121,7 @@ void CommKokkos::init()
   forward_pair_comm_legacy = lmp->kokkos->forward_pair_comm_legacy;
   reverse_pair_comm_legacy = lmp->kokkos->reverse_pair_comm_legacy;
   forward_fix_comm_legacy = lmp->kokkos->forward_fix_comm_legacy;
+  forward_compute_comm_legacy = lmp->kokkos->forward_compute_comm_legacy;
   reverse_comm_legacy = lmp->kokkos->reverse_comm_legacy;
   reverse_fix_comm_legacy = lmp->kokkos->reverse_fix_comm_legacy;
   exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
@@ -555,8 +560,86 @@ void CommKokkos::reverse_comm_variable(Fix *fix)
 
 void CommKokkos::forward_comm(Compute *compute, int size)
 {
-  k_sendlist.sync_host();
-  CommBrick::forward_comm(compute, size);
+  if (compute->execution_space == Host || compute->execution_space == HostKK ||
+      !compute->forward_comm_device || forward_compute_comm_legacy) {    // host path: compute's execution space is Host/HostKK, it has no device comm support, or legacy comm is requested
+    k_sendlist.sync_host();    // bring the host copy of the send list up to date before the CommBrick implementation runs
+    CommBrick::forward_comm(compute, size);
+  } else {    // device path: pack/unpack through the compute's KokkosBase hooks
+    k_sendlist.sync_device();
+    forward_comm_device<LMPDeviceType>(compute, size);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // forward communication for a Compute, packing/unpacking via its KokkosBase *_kokkos methods
+void CommKokkos::forward_comm_device(Compute *compute, int size)
+{
+  int iswap,n,nsize;
+  MPI_Request request;
+  DAT::tdual_double_1d k_buf_tmp;    // refers to whichever buffer is unpacked from in the current swap
+
+  if (size) nsize = size;    // a nonzero size overrides the compute's own comm_forward
+  else nsize = compute->comm_forward;
+  KokkosBase* computeKKBase = dynamic_cast<KokkosBase*>(compute);    // NOTE(review): used below without a null check; assumes the compute derives from KokkosBase
+
+  for (iswap = 0; iswap < nswap; iswap++) {    // first pass: grow the buffers to fit the largest send or receive
+    int n = MAX(max_buf_compute,nsize*sendnum[iswap]);    // NOTE: this n shadows the function-scope n declared above
+    n = MAX(n,nsize*recvnum[iswap]);
+    if (n > max_buf_compute)
+      grow_buf_compute(n);
+  }
+
+  for (iswap = 0; iswap < nswap; iswap++) {    // second pass: pack, exchange, unpack one swap at a time
+
+    // pack buffer; n receives the number of values reported by the compute
+
+    auto k_sendlist_iswap = Kokkos::subview(k_sendlist,iswap,Kokkos::ALL);
+    n = computeKKBase->pack_forward_comm_kokkos(sendnum[iswap],k_sendlist_iswap,
+                                      k_buf_send_compute,pbc_flag[iswap],pbc[iswap]);
+
+    // exchange with another proc
+    // if self, set recv buffer to send buffer
+
+    if (sendproc[iswap] != me) {
+      double* buf_send_compute;
+      double* buf_recv_compute;
+      if (lmp->kokkos->gpu_aware_flag) {    // pass the DeviceType views' pointers straight to MPI
+        buf_send_compute = k_buf_send_compute.view<DeviceType>().data();
+        buf_recv_compute = k_buf_recv_compute.view<DeviceType>().data();
+      } else {    // otherwise stage the packed data through the host views
+        k_buf_send_compute.modify<DeviceType>();
+        k_buf_send_compute.sync_host();
+        buf_send_compute = k_buf_send_compute.view_host().data();
+        buf_recv_compute = k_buf_recv_compute.view_host().data();
+      }
+
+      if (recvnum[iswap]) {
+        DeviceType().fence();    // fence the execution space before MPI touches the buffers
+        MPI_Irecv(buf_recv_compute,nsize*recvnum[iswap],MPI_DOUBLE,
+                  recvproc[iswap],0,world,&request);
+      }
+      if (sendnum[iswap]) {
+        DeviceType().fence();
+        MPI_Send(buf_send_compute,n,MPI_DOUBLE,sendproc[iswap],0,world);    // sends the n values reported by the pack call
+      }
+
+      if (recvnum[iswap]) {
+        MPI_Wait(&request,MPI_STATUS_IGNORE);
+        DeviceType().fence();
+      }
+
+      if (!lmp->kokkos->gpu_aware_flag) {    // received data landed in the host view: mark it and copy to DeviceType
+        k_buf_recv_compute.modify_host();
+        k_buf_recv_compute.sync<DeviceType>();
+      }
+      k_buf_tmp = k_buf_recv_compute;
+    } else k_buf_tmp = k_buf_send_compute;    // swap with self: unpack directly from the send buffer
+
+    // unpack buffer
+
+    computeKKBase->unpack_forward_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_tmp);
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -714,6 +797,15 @@ void CommKokkos::grow_buf_fix(int n) {
   k_buf_recv_fix.resize(max_buf_fix);
 }
 
+/* ---------------------------------------------------------------------- */
+
+void CommKokkos::grow_buf_compute(int n) {    // resize both compute comm buffers to n scaled by BUFFACTOR
+  max_buf_compute = n * BUFFACTOR;    // result is stored in an int member
+  k_buf_send_compute.resize(max_buf_compute);
+  k_buf_recv_compute.resize(max_buf_compute);
+}
+
+
 /* ---------------------------------------------------------------------- */
 
 void CommKokkos::reverse_comm(Pair *pair, int size)
diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h
index 875ed290d8a..17dbb76ebca 100644
--- a/src/KOKKOS/comm_kokkos.h
+++ b/src/KOKKOS/comm_kokkos.h
@@ -29,6 +29,7 @@ class CommKokkos : public CommBrick {
   bool forward_pair_comm_legacy;
   bool reverse_pair_comm_legacy;
   bool forward_fix_comm_legacy;
+  bool forward_compute_comm_legacy;
   bool reverse_comm_legacy;
   bool reverse_fix_comm_legacy;
   bool exchange_comm_on_host;
@@ -66,6 +67,7 @@ class CommKokkos : public CommBrick {
   template<class DeviceType> void reverse_comm_device(Pair *pair, int size=0);
   template<class DeviceType> void forward_comm_device(Fix *fix, int size=0);
   template<class DeviceType> void reverse_comm_device(Fix *fix, int size=0);
+  template<class DeviceType> void forward_comm_device(Compute *compute, int size=0);
   template<class DeviceType> void exchange_device();
   template<class DeviceType> void borders_device();
 
@@ -86,11 +88,12 @@ class CommKokkos : public CommBrick {
   DAT::tdual_int_1d k_sendnum_scan;
   int totalsend;
 
-  int max_buf_pair,max_buf_fix;
-  DAT::tdual_double_1d k_buf_send_pair, k_buf_send_fix;
-  DAT::tdual_double_1d k_buf_recv_pair, k_buf_recv_fix;
+  int max_buf_pair,max_buf_fix,max_buf_compute;
+  DAT::tdual_double_1d k_buf_send_pair, k_buf_send_fix, k_buf_send_compute;
+  DAT::tdual_double_1d k_buf_recv_pair, k_buf_recv_fix, k_buf_recv_compute;
   void grow_buf_pair(int);
   void grow_buf_fix(int);
+  void grow_buf_compute(int);
 
   void grow_send(int, int) override;
   void grow_recv(int) override;
diff --git a/src/KOKKOS/comm_tiled_kokkos.cpp b/src/KOKKOS/comm_tiled_kokkos.cpp
index 3d448b00b60..32d151b65cd 100644
--- a/src/KOKKOS/comm_tiled_kokkos.cpp
+++ b/src/KOKKOS/comm_tiled_kokkos.cpp
@@ -618,7 +618,6 @@ void CommTiledKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space)
                         atomKK->avecKK->size_border + atomKK->avecKK->size_velocity);
     else
       k_buf_send.resize(maxsend_border,atomKK->avecKK->size_border);
-    buf_send = k_buf_send.view_host().data();
   } else {
     if (ghost_velocity)
       MemoryKokkos::realloc_kokkos(k_buf_send,"comm:k_buf_send",maxsend_border,
@@ -626,8 +625,8 @@ void CommTiledKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space)
     else
       MemoryKokkos::realloc_kokkos(k_buf_send,"comm:k_buf_send",maxsend_border,
                         atomKK->avecKK->size_border);
-    buf_send = k_buf_send.view_host().data();
   }
+  buf_send = k_buf_send.view_host().data();
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/compute_ave_sphere_atom_kokkos.cpp b/src/KOKKOS/compute_ave_sphere_atom_kokkos.cpp
index 85d5b419958..4b420c000bb 100644
--- a/src/KOKKOS/compute_ave_sphere_atom_kokkos.cpp
+++ b/src/KOKKOS/compute_ave_sphere_atom_kokkos.cpp
@@ -22,6 +22,7 @@
 #include "comm.h"
 #include "domain.h"
 #include "force.h"
+#include "kokkos.h"
 #include "memory_kokkos.h"
 #include "neigh_request.h"
 #include "neighbor_kokkos.h"
@@ -36,6 +37,8 @@ ComputeAveSphereAtomKokkos<DeviceType>::ComputeAveSphereAtomKokkos(LAMMPS *lmp,
   ComputeAveSphereAtom(lmp, narg, arg)
 {
   kokkosable = 1;
+  forward_comm_device = 1;
+
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = EMPTY_MASK;
@@ -72,6 +75,9 @@ void ComputeAveSphereAtomKokkos<DeviceType>::init()
 template<class DeviceType>
 void ComputeAveSphereAtomKokkos<DeviceType>::compute_peratom()
 {
+  int prev_auto_sync = lmp->kokkos->auto_sync;
+  lmp->kokkos->auto_sync = 0;
+
   invoked_peratom = update->ntimestep;
 
   // grow result array if necessary
@@ -86,9 +92,7 @@ void ComputeAveSphereAtomKokkos<DeviceType>::compute_peratom()
 
   // need velocities of ghost atoms
 
-  atomKK->sync(Host,V_MASK);
   comm->forward_comm(this);
-  atomKK->modified(Host,V_MASK);
 
   // invoke full neighbor list (will copy or build if necessary)
 
@@ -125,8 +129,13 @@ void ComputeAveSphereAtomKokkos<DeviceType>::compute_peratom()
 
   k_result.modify<DeviceType>();
   k_result.sync_host();
+  atomKK->k_v.clear_sync_state();
+
+  lmp->kokkos->auto_sync = prev_auto_sync;
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 // NOLINTNEXTLINE
 KOKKOS_INLINE_FUNCTION
@@ -209,6 +218,91 @@ void ComputeAveSphereAtomKokkos<DeviceType>::operator()(TagComputeAveSphereAtom,
   }
 }
 
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // pack for forward comm: gathers velocities of the listed atoms into k_buf, 3 values per atom
+int ComputeAveSphereAtomKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_int_1d k_sendlist,
+                                                         DAT::tdual_double_1d &k_buf,
+                                                         int pbc_flag, int* pbc)    // pbc_flag and pbc are not referenced in this body
+{
+  d_sendlist = k_sendlist.view<DeviceType>();
+  d_buf = k_buf.view<DeviceType>();
+
+  atomKK->sync(execution_space,V_MASK);    // make velocities current in this execution space before reading them
+  v = atomKK->k_v.view<DeviceType>();
+
+  copymode = 1;    // set while *this is handed to the Kokkos dispatch as the functor
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagComputeAveSphereAtomPackForwardComm>(0,n),*this);
+  copymode = 0;
+
+  return n*3;    // number of values written to the buffer
+}
+
+template<class DeviceType>    // pack kernel body: work item i copies one atom's velocity into the buffer
+// NOLINTNEXTLINE
+KOKKOS_INLINE_FUNCTION
+void ComputeAveSphereAtomKokkos<DeviceType>::operator()(TagComputeAveSphereAtomPackForwardComm, const int &i) const {
+  const int j = d_sendlist(i);    // atom index whose velocity goes into buffer slot i
+
+  d_buf[3*i] = static_cast<double>(v(j,0));    // buffer entries are written as double
+  d_buf[3*i+1] = static_cast<double>(v(j,1));
+  d_buf[3*i+2] = static_cast<double>(v(j,2));
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // unpack for forward comm: writes received velocities into atoms first_in .. first_in+n-1
+void ComputeAveSphereAtomKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first_in, DAT::tdual_double_1d &buf)
+{
+  first = first_in;    // kept in a member so the kernel body can read it
+  d_buf = buf.view<DeviceType>();
+
+  atomKK->sync(execution_space,V_MASK);
+  v = atomKK->k_v.view<DeviceType>();
+
+  copymode = 1;    // set while *this is handed to the Kokkos dispatch as the functor
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagComputeAveSphereAtomUnpackForwardComm>(0,n),*this);
+  copymode = 0;
+
+  atomKK->modified(execution_space,V_MASK);    // flag velocities as modified in this execution space
+}
+
+template<class DeviceType>    // unpack kernel body: work item i stores buffer slot i into atom i + first
+// NOLINTNEXTLINE
+KOKKOS_INLINE_FUNCTION
+void ComputeAveSphereAtomKokkos<DeviceType>::operator()(TagComputeAveSphereAtomUnpackForwardComm, const int &i) const {
+  v(i + first,0) = static_cast<KK_FLOAT>(d_buf[3*i]);    // convert from the buffer's double to KK_FLOAT
+  v(i + first,1) = static_cast<KK_FLOAT>(d_buf[3*i+1]);
+  v(i + first,2) = static_cast<KK_FLOAT>(d_buf[3*i+2]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // host pack: syncs velocities to the host, then delegates to the base-class pack
+int ComputeAveSphereAtomKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf,
+                                int pbc_flag, int *pbc)
+{
+  atomKK->sync(Host,V_MASK);    // base-class code is given host data
+
+  int m = ComputeAveSphereAtom::pack_forward_comm(n,list,buf,pbc_flag,pbc);
+
+  return m;    // value returned by the base-class implementation
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // host unpack: delegates to the base class and records that host velocities changed
+void ComputeAveSphereAtomKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  atomKK->sync(Host,V_MASK);    // make the host copy of the velocities current before the base class runs
+
+  ComputeAveSphereAtom::unpack_forward_comm(n,first,buf);
+
+  atomKK->modified(Host,V_MASK);    // flag velocities as modified on the host
+}
+
+/* ---------------------------------------------------------------------- */
+
 namespace LAMMPS_NS {
 template class ComputeAveSphereAtomKokkos<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
diff --git a/src/KOKKOS/compute_ave_sphere_atom_kokkos.h b/src/KOKKOS/compute_ave_sphere_atom_kokkos.h
index 049e93e66c3..cb5279b4b40 100644
--- a/src/KOKKOS/compute_ave_sphere_atom_kokkos.h
+++ b/src/KOKKOS/compute_ave_sphere_atom_kokkos.h
@@ -25,14 +25,17 @@ ComputeStyle(ave/sphere/atom/kk/host,ComputeAveSphereAtomKokkos<LMPHostType>);
 
 #include "compute_ave_sphere_atom.h"
 #include "kokkos_type.h"
+#include "kokkos_base.h"
 
 namespace LAMMPS_NS {
 
 // clang-format off
 struct TagComputeAveSphereAtom {};
+struct TagComputeAveSphereAtomPackForwardComm{};    // dispatch tag for the operator() that packs velocities
+struct TagComputeAveSphereAtomUnpackForwardComm{};    // dispatch tag for the operator() that unpacks velocities
 // clang-format on
 
-template <class DeviceType> class ComputeAveSphereAtomKokkos : public ComputeAveSphereAtom {
+template <class DeviceType> class ComputeAveSphereAtomKokkos : public ComputeAveSphereAtom, public KokkosBase {
  public:
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
@@ -42,10 +45,24 @@ template <class DeviceType> class ComputeAveSphereAtomKokkos : public ComputeAve
   void init() override;
   void compute_peratom() override;
 
+  int pack_forward_comm_kokkos(int, DAT::tdual_int_1d, DAT::tdual_double_1d&,
+                       int, int *) override;
+  void unpack_forward_comm_kokkos(int, int, DAT::tdual_double_1d&) override;
+  int pack_forward_comm(int, int *, double *, int, int *) override;
+  void unpack_forward_comm(int, int, double *) override;
+
 // NOLINTNEXTLINE
   KOKKOS_INLINE_FUNCTION
   void operator()(TagComputeAveSphereAtom, const int &) const;
 
+// NOLINTNEXTLINE
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeAveSphereAtomPackForwardComm, const int&) const;
+
+// NOLINTNEXTLINE
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeAveSphereAtomUnpackForwardComm, const int&) const;
+
  private:
   KK_FLOAT adof, mvv2e, mv2d, boltz;
 
@@ -62,6 +79,11 @@ template <class DeviceType> class ComputeAveSphereAtomKokkos : public ComputeAve
 
   DAT::ttransform_kkfloat_2d k_result;
   typename AT::t_kkfloat_2d d_result;
+
+  int first,nsend;
+
+  typename AT::t_int_1d d_sendlist;
+  typename AT::t_double_1d_um d_buf;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/KOKKOS/dihedral_nharmonic_kokkos.cpp b/src/KOKKOS/dihedral_nharmonic_kokkos.cpp
new file mode 100644
index 00000000000..2de03c673ef
--- /dev/null
+++ b/src/KOKKOS/dihedral_nharmonic_kokkos.cpp
@@ -0,0 +1,543 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer using Claude Opus 4.6
+   [ based on dihedral_multi_harmonic_kokkos.cpp and dihedral_fourier_kokkos.cpp]
+------------------------------------------------------------------------- */
+
+#include "dihedral_nharmonic_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory_kokkos.h"
+#include "neighbor_kokkos.h"
+
+#include <cmath>
+
+using namespace LAMMPS_NS;
+
+static constexpr double TOLERANCE = 0.05;
+static constexpr double SMALL = 0.001;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // constructor: sets Kokkos bookkeeping on top of the DihedralNHarmonic base
+DihedralNHarmonicKokkos<DeviceType>::DihedralNHarmonicKokkos(LAMMPS *lmp) : DihedralNHarmonic(lmp)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  neighborKK = (NeighborKokkos *) neighbor;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | Q_MASK | ENERGY_MASK | VIRIAL_MASK;    // NOTE(review): the kernel in this file reads x and accumulates into f only; Q_MASK looks unneeded - confirm
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+
+  k_warning_flag = DAT::tdual_int_scalar("Dihedral:warning_flag");    // scalar flag the kernel sets when c is out of tolerance
+  d_warning_flag = k_warning_flag.view<DeviceType>();
+  h_warning_flag = k_warning_flag.view_host();
+
+  centroidstressflag = CENTROID_NOTAVAIL;
+
+  allocated_kokkos = 0;    // coefficient dual views are created on the first allocate_kokkos() call
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // destructor: releases the per-atom energy/virial arrays
+DihedralNHarmonicKokkos<DeviceType>::~DihedralNHarmonicKokkos()
+{
+  if (!copymode) {    // copymode is set in compute() around the kernel launches; skip the frees while it is set
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // launches the dihedral kernel over the dihedral list and collects energy/virial results
+void DihedralNHarmonicKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  ev_init(eflag,vflag,0);
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    if ((int)k_eatom.extent(0) < maxeatom) {
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"dihedral:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+    } else Kokkos::deep_copy(d_eatom,0.0);    // already large enough: just zero it
+  }
+  if (vflag_atom) {
+    if ((int)k_vatom.extent(0) < maxvatom) {
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"dihedral:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+    } else Kokkos::deep_copy(d_vatom,0.0);    // already large enough: just zero it
+  }
+
+  k_a.template sync<DeviceType>();    // coefficients were written on the host in coeff()/read_restart()
+  k_nterms.template sync<DeviceType>();
+
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  neighborKK->k_dihedrallist.template sync<DeviceType>();
+  dihedrallist = neighborKK->k_dihedrallist.view<DeviceType>();
+  int ndihedrallist = neighborKK->ndihedrallist;
+  nlocal = atom->nlocal;
+  newton_bond = force->newton_bond;
+
+  h_warning_flag() = 0;    // clear the warning flag on the host and push it to DeviceType
+  k_warning_flag.modify_host();
+  k_warning_flag.template sync<DeviceType>();
+
+  copymode = 1;    // stays set until the end of this function; the destructor skips its frees while set
+
+  // loop over all entries of the dihedral list
+
+  EV_FLOAT ev;
+
+  if (evflag) {    // energy/virial requested: reduce into ev
+    if (newton_bond) {
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagDihedralNHarmonicCompute<1,1> >(0,ndihedrallist),*this,ev);
+    } else {
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagDihedralNHarmonicCompute<0,1> >(0,ndihedrallist),*this,ev);
+    }
+  } else {    // forces only
+    if (newton_bond) {
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagDihedralNHarmonicCompute<1,0> >(0,ndihedrallist),*this);
+    } else {
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagDihedralNHarmonicCompute<0,0> >(0,ndihedrallist),*this);
+    }
+  }
+
+  // error check: read back the flag the kernel may have set
+
+  k_warning_flag.template modify<DeviceType>();
+  k_warning_flag.sync_host();
+  if (h_warning_flag())
+    error->warning(FLERR,"Dihedral problem");
+
+  if (eflag_global) energy += ev.evdwl;    // ev_tally() accumulates the dihedral energy in ev.evdwl
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (eflag_atom) {    // per-atom energies were written through the DeviceType view: copy to host
+    k_eatom.template modify<DeviceType>();
+    k_eatom.sync_host();
+  }
+
+  if (vflag_atom) {    // same for per-atom virials
+    k_vatom.template modify<DeviceType>();
+    k_vatom.sync_host();
+  }
+
+  copymode = 0;
+}
+
+template<class DeviceType>    // kernel body for dihedral n: adds forces to f and, when EVFLAG is set, tallies into ev
+template<int NEWTON_BOND, int EVFLAG>
+// NOLINTNEXTLINE
+KOKKOS_INLINE_FUNCTION
+void DihedralNHarmonicKokkos<DeviceType>::operator()(TagDihedralNHarmonicCompute<NEWTON_BOND,EVFLAG>, const int &n, EV_FLOAT& ev) const {
+
+  // The f array is atomic
+  Kokkos::View<KK_ACC_FLOAT*[3], typename DAT::t_kkacc_1d_3::array_layout,typename KKDevice<DeviceType>::value,Kokkos::MemoryTraits<Kokkos::Atomic|Kokkos::Unmanaged> > a_f = f;
+
+  const int i1 = dihedrallist(n,0);    // columns 0-3: the four atom indices, column 4: dihedral type
+  const int i2 = dihedrallist(n,1);
+  const int i3 = dihedrallist(n,2);
+  const int i4 = dihedrallist(n,3);
+  const int type = dihedrallist(n,4);
+
+  // 1st bond
+
+  const KK_FLOAT vb1x = x(i1,0) - x(i2,0);
+  const KK_FLOAT vb1y = x(i1,1) - x(i2,1);
+  const KK_FLOAT vb1z = x(i1,2) - x(i2,2);
+
+  // 2nd bond
+
+  const KK_FLOAT vb2x = x(i3,0) - x(i2,0);
+  const KK_FLOAT vb2y = x(i3,1) - x(i2,1);
+  const KK_FLOAT vb2z = x(i3,2) - x(i2,2);
+
+  const KK_FLOAT vb2xm = -vb2x;    // negated 2nd bond vector
+  const KK_FLOAT vb2ym = -vb2y;
+  const KK_FLOAT vb2zm = -vb2z;
+
+  // 3rd bond
+
+  const KK_FLOAT vb3x = x(i4,0) - x(i3,0);
+  const KK_FLOAT vb3y = x(i4,1) - x(i3,1);
+  const KK_FLOAT vb3z = x(i4,2) - x(i3,2);
+
+  // c0 calculation
+
+  const KK_FLOAT sb1 = 1.0 / (vb1x * vb1x + vb1y * vb1y + vb1z * vb1z);    // inverse squared bond lengths
+  const KK_FLOAT sb2 = 1.0 / (vb2x * vb2x + vb2y * vb2y + vb2z * vb2z);
+  const KK_FLOAT sb3 = 1.0 / (vb3x * vb3x + vb3y * vb3y + vb3z * vb3z);
+
+  const KK_FLOAT rb1 = sqrt(sb1);    // inverse bond lengths
+  const KK_FLOAT rb3 = sqrt(sb3);
+
+  KK_FLOAT c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * rb1 * rb3;    // normalized dot product of bonds 1 and 3
+
+  // 1st and 2nd angle
+
+  KK_FLOAT b1mag2 = vb1x * vb1x + vb1y * vb1y + vb1z * vb1z;    // squared lengths again (same sums as in sb1..sb3)
+  KK_FLOAT b1mag = sqrt(b1mag2);
+  KK_FLOAT b2mag2 = vb2x * vb2x + vb2y * vb2y + vb2z * vb2z;
+  KK_FLOAT b2mag = sqrt(b2mag2);
+  KK_FLOAT b3mag2 = vb3x * vb3x + vb3y * vb3y + vb3z * vb3z;
+  KK_FLOAT b3mag = sqrt(b3mag2);
+
+  KK_FLOAT ctmp = vb1x * vb2x + vb1y * vb2y + vb1z * vb2z;
+  KK_FLOAT r12c1 = 1.0 / (b1mag * b2mag);
+  KK_FLOAT c1mag = ctmp * r12c1;    // normalized dot product of bonds 1 and 2
+
+  ctmp = vb2xm * vb3x + vb2ym * vb3y + vb2zm * vb3z;
+  KK_FLOAT r12c2 = 1.0 / (b2mag * b3mag);
+  KK_FLOAT c2mag = ctmp * r12c2;    // normalized dot product of negated bond 2 and bond 3
+
+  // cos and sin of 2 angles and final c
+
+  KK_FLOAT sin2 = MAX(1.0 - c1mag * c1mag, 0.0);    // clamp at 0 so the sqrt argument is never negative
+  KK_FLOAT sc1 = sqrt(sin2);
+  if (sc1 < SMALL) sc1 = SMALL;    // lower bound before taking the reciprocal
+  sc1 = 1.0 / sc1;
+
+  sin2 = MAX(1.0 - c2mag * c2mag, 0.0);
+  KK_FLOAT sc2 = sqrt(sin2);
+  if (sc2 < SMALL) sc2 = SMALL;
+  sc2 = 1.0 / sc2;
+
+  KK_FLOAT s1 = sc1 * sc1;
+  KK_FLOAT s2 = sc2 * sc2;
+  KK_FLOAT s12 = sc1 * sc2;
+  KK_FLOAT c = (c0 + c1mag * c2mag) * s12;
+
+  // error check: flag values of c outside [-1-TOLERANCE, 1+TOLERANCE]
+
+  if ((c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) && !d_warning_flag())
+    d_warning_flag() = 1;
+
+  if (c > 1.0) c = 1.0;    // clamp c to [-1,1]
+  if (c < -1.0) c = -1.0;
+
+  // force & energy
+  // p = sum (i=1,n) a_i * c**(i-1)
+  // pd = dp/dc
+
+  const int nt = d_nterms[type];
+  KK_FLOAT c_ = 1.0;    // running power of c
+  KK_FLOAT p = d_a(type, 0);
+  KK_FLOAT pd = 0.0;
+  for (int i = 1; i < nt; i++) {
+    pd += c_ * i * d_a(type, i);    // here c_ holds c^(i-1)
+    c_ *= c;
+    p += c_ * d_a(type, i);    // here c_ holds c^i
+  }
+
+  KK_FLOAT edihedral = 0.0;
+  if (eflag) edihedral = p;
+
+  c = c * pd;    // from here on c and s12 carry the dp/dc factor
+  s12 = s12 * pd;
+  const KK_FLOAT a11 = c * sb1 * s1;
+  const KK_FLOAT a22 = -sb2 * (2.0 * c0 * s12 - c * (s1 + s2));
+  const KK_FLOAT a33 = c * sb3 * s2;
+  const KK_FLOAT a12 = -r12c1 * (c1mag * c * s1 + c2mag * s12);
+  const KK_FLOAT a13 = -rb1 * rb3 * s12;
+  const KK_FLOAT a23 = r12c2 * (c2mag * c * s2 + c1mag * s12);
+
+  const KK_FLOAT sx2 = a12 * vb1x + a22 * vb2x + a23 * vb3x;
+  const KK_FLOAT sy2 = a12 * vb1y + a22 * vb2y + a23 * vb3y;
+  const KK_FLOAT sz2 = a12 * vb1z + a22 * vb2z + a23 * vb3z;
+
+  KK_FLOAT f1[3],f2[3],f3[3],f4[3];
+  f1[0] = a11 * vb1x + a12 * vb2x + a13 * vb3x;
+  f1[1] = a11 * vb1y + a12 * vb2y + a13 * vb3y;
+  f1[2] = a11 * vb1z + a12 * vb2z + a13 * vb3z;
+
+  f2[0] = -sx2 - f1[0];
+  f2[1] = -sy2 - f1[1];
+  f2[2] = -sz2 - f1[2];
+
+  f4[0] = a13 * vb1x + a23 * vb2x + a33 * vb3x;
+  f4[1] = a13 * vb1y + a23 * vb2y + a33 * vb3y;
+  f4[2] = a13 * vb1z + a23 * vb2z + a33 * vb3z;
+
+  f3[0] = sx2 - f4[0];
+  f3[1] = sy2 - f4[1];
+  f3[2] = sz2 - f4[2];
+
+  // apply force to each of 4 atoms; without NEWTON_BOND only atoms with index < nlocal are updated
+
+  if (NEWTON_BOND || i1 < nlocal) {
+    a_f(i1,0) += f1[0];
+    a_f(i1,1) += f1[1];
+    a_f(i1,2) += f1[2];
+  }
+
+  if (NEWTON_BOND || i2 < nlocal) {
+    a_f(i2,0) += f2[0];
+    a_f(i2,1) += f2[1];
+    a_f(i2,2) += f2[2];
+  }
+
+  if (NEWTON_BOND || i3 < nlocal) {
+    a_f(i3,0) += f3[0];
+    a_f(i3,1) += f3[1];
+    a_f(i3,2) += f3[2];
+  }
+
+  if (NEWTON_BOND || i4 < nlocal) {
+    a_f(i4,0) += f4[0];
+    a_f(i4,1) += f4[1];
+    a_f(i4,2) += f4[2];
+  }
+
+  if (EVFLAG)    // f2 is not passed: ev_tally() works from f1, f3, f4 and the bond vectors
+    ev_tally(ev,i1,i2,i3,i4,edihedral,f1,f3,f4,
+             vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z);
+}
+
+template<class DeviceType>    // parallel_for entry point: forwards to the reducing overload with a local accumulator
+template<int NEWTON_BOND, int EVFLAG>
+// NOLINTNEXTLINE
+KOKKOS_INLINE_FUNCTION
+void DihedralNHarmonicKokkos<DeviceType>::operator()(TagDihedralNHarmonicCompute<NEWTON_BOND,EVFLAG>, const int &n) const {
+  EV_FLOAT ev;    // local accumulator, not used after the call
+  this->template operator()<NEWTON_BOND,EVFLAG>(TagDihedralNHarmonicCompute<NEWTON_BOND,EVFLAG>(), n, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>    // create or resize the coefficient dual views and refresh the DeviceType views
+void DihedralNHarmonicKokkos<DeviceType>::allocate_kokkos()
+{
+  int n = atom->ndihedraltypes;
+
+  if (!allocated_kokkos) {    // first call: create the views; n+1 rows because callers index by type starting at 1
+    k_a = DAT::tdual_kkfloat_2d("DihedralNHarmonic::a",n+1,nterms_max);
+    k_nterms = DAT::tdual_int_1d("DihedralNHarmonic::nterms",n+1);
+  } else {    // later calls: resize in place
+    k_a.resize(n+1,nterms_max);
+    k_nterms.resize(n+1);
+  }
+
+  d_a = k_a.template view<DeviceType>();    // re-fetch the views after creation/resize
+  d_nterms = k_nterms.template view<DeviceType>();
+
+  allocated_kokkos = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more types and copy them to the host dual views
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void DihedralNHarmonicKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  DihedralNHarmonic::coeff(narg, arg);    // base class fills nterms[] and a[][]
+  allocate_kokkos();
+
+  int ilo,ihi;
+  utils::bounds(FLERR,arg[0],1,atom->ndihedraltypes,ilo,ihi,error);    // type range taken from arg[0]
+
+  for (int i = ilo; i <= ihi; i++) {
+    k_nterms.view_host()[i] = nterms[i];
+    for (int j = 0; j < nterms[i]; j++)
+      k_a.view_host()(i,j) = a[i][j];
+  }
+
+  k_a.modify_host();    // mark host copies as modified; compute() syncs them to DeviceType
+  k_nterms.modify_host();
+}
+
+/* ----------------------------------------------------------------------
+   read coeffs via the base class, then copy all types to the host dual views
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void DihedralNHarmonicKokkos<DeviceType>::read_restart(FILE *fp)
+{
+  DihedralNHarmonic::read_restart(fp);    // base class fills nterms[] and a[][]
+  allocate_kokkos();
+
+  int n = atom->ndihedraltypes;
+  for (int i = 1; i <= n; i++) {    // types are indexed 1..n
+    k_nterms.view_host()[i] = nterms[i];
+    for (int j = 0; j < nterms[i]; j++)
+      k_a.view_host()(i,j) = a[i][j];
+  }
+
+  k_a.modify_host();    // mark host copies as modified; compute() syncs them to DeviceType
+  k_nterms.modify_host();
+}
+
+/* ----------------------------------------------------------------------
+   tally energy and virial into global and per-atom accumulators
+   virial = r1F1 + r2F2 + r3F3 + r4F4 = (r1-r2) F1 + (r3-r2) F3 + (r4-r2) F4
+          = (r1-r2) F1 + (r3-r2) F3 + (r4-r3 + r3-r2) F4
+          = vb1*f1 + vb2*f3 + (vb3+vb2)*f4
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+//template<int NEWTON_BOND>
+// NOLINTNEXTLINE
+KOKKOS_INLINE_FUNCTION
+void DihedralNHarmonicKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int i1, const int i2, const int i3, const int i4,
+                        KK_FLOAT &edihedral, KK_FLOAT *f1, KK_FLOAT *f3, KK_FLOAT *f4,
+                        const KK_FLOAT &vb1x, const KK_FLOAT &vb1y, const KK_FLOAT &vb1z,
+                        const KK_FLOAT &vb2x, const KK_FLOAT &vb2y, const KK_FLOAT &vb2z,
+                        const KK_FLOAT &vb3x, const KK_FLOAT &vb3y, const KK_FLOAT &vb3z) const
+{
+  KK_FLOAT edihedralquarter;    // one quarter of the dihedral energy (share of one atom)
+  KK_FLOAT v[6];    // virial components in the order xx, yy, zz, xy, xz, yz
+
+  // The eatom and vatom arrays are atomic
+  Kokkos::View<KK_ACC_FLOAT*, typename DAT::t_kkacc_1d::array_layout,typename KKDevice<DeviceType>::value,Kokkos::MemoryTraits<Kokkos::Atomic|Kokkos::Unmanaged> > v_eatom = d_eatom;
+  Kokkos::View<KK_ACC_FLOAT*[6], typename DAT::t_kkacc_1d_6::array_layout,typename KKDevice<DeviceType>::value,Kokkos::MemoryTraits<Kokkos::Atomic|Kokkos::Unmanaged> > v_vatom = d_vatom;
+
+  if (eflag_either) {
+    if (eflag_global) {
+      if (newton_bond) ev.evdwl += edihedral;    // full energy goes into the evdwl slot
+      else {    // newton_bond off: add a quarter for each atom with index < nlocal
+        edihedralquarter = 0.25*edihedral;
+        if (i1 < nlocal) ev.evdwl += edihedralquarter;
+        if (i2 < nlocal) ev.evdwl += edihedralquarter;
+        if (i3 < nlocal) ev.evdwl += edihedralquarter;
+        if (i4 < nlocal) ev.evdwl += edihedralquarter;
+      }
+    }
+    if (eflag_atom) {    // per-atom energy: a quarter to each eligible atom
+      edihedralquarter = 0.25*edihedral;
+      if (newton_bond || i1 < nlocal) v_eatom[i1] += edihedralquarter;
+      if (newton_bond || i2 < nlocal) v_eatom[i2] += edihedralquarter;
+      if (newton_bond || i3 < nlocal) v_eatom[i3] += edihedralquarter;
+      if (newton_bond || i4 < nlocal) v_eatom[i4] += edihedralquarter;
+    }
+  }
+
+  if (vflag_either) {
+    v[0] = vb1x*f1[0] + vb2x*f3[0] + (vb3x+vb2x)*f4[0];    // vb1*f1 + vb2*f3 + (vb3+vb2)*f4, per component pair
+    v[1] = vb1y*f1[1] + vb2y*f3[1] + (vb3y+vb2y)*f4[1];
+    v[2] = vb1z*f1[2] + vb2z*f3[2] + (vb3z+vb2z)*f4[2];
+    v[3] = vb1x*f1[1] + vb2x*f3[1] + (vb3x+vb2x)*f4[1];
+    v[4] = vb1x*f1[2] + vb2x*f3[2] + (vb3x+vb2x)*f4[2];
+    v[5] = vb1y*f1[2] + vb2y*f3[2] + (vb3y+vb2y)*f4[2];
+
+    if (vflag_global) {
+      if (newton_bond) {    // full virial contribution
+        ev.v[0] += v[0];
+        ev.v[1] += v[1];
+        ev.v[2] += v[2];
+        ev.v[3] += v[3];
+        ev.v[4] += v[4];
+        ev.v[5] += v[5];
+      } else {    // newton_bond off: a quarter for each atom with index < nlocal
+        if (i1 < nlocal) {
+          ev.v[0] += 0.25*v[0];
+          ev.v[1] += 0.25*v[1];
+          ev.v[2] += 0.25*v[2];
+          ev.v[3] += 0.25*v[3];
+          ev.v[4] += 0.25*v[4];
+          ev.v[5] += 0.25*v[5];
+        }
+        if (i2 < nlocal) {
+          ev.v[0] += 0.25*v[0];
+          ev.v[1] += 0.25*v[1];
+          ev.v[2] += 0.25*v[2];
+          ev.v[3] += 0.25*v[3];
+          ev.v[4] += 0.25*v[4];
+          ev.v[5] += 0.25*v[5];
+        }
+        if (i3 < nlocal) {
+          ev.v[0] += 0.25*v[0];
+          ev.v[1] += 0.25*v[1];
+          ev.v[2] += 0.25*v[2];
+          ev.v[3] += 0.25*v[3];
+          ev.v[4] += 0.25*v[4];
+          ev.v[5] += 0.25*v[5];
+        }
+        if (i4 < nlocal) {
+          ev.v[0] += 0.25*v[0];
+          ev.v[1] += 0.25*v[1];
+          ev.v[2] += 0.25*v[2];
+          ev.v[3] += 0.25*v[3];
+          ev.v[4] += 0.25*v[4];
+          ev.v[5] += 0.25*v[5];
+        }
+      }
+    }
+
+    if (vflag_atom) {    // per-atom virial: a quarter to each eligible atom
+      if (newton_bond || i1 < nlocal) {
+        v_vatom(i1,0) += 0.25*v[0];
+        v_vatom(i1,1) += 0.25*v[1];
+        v_vatom(i1,2) += 0.25*v[2];
+        v_vatom(i1,3) += 0.25*v[3];
+        v_vatom(i1,4) += 0.25*v[4];
+        v_vatom(i1,5) += 0.25*v[5];
+      }
+      if (newton_bond || i2 < nlocal) {
+        v_vatom(i2,0) += 0.25*v[0];
+        v_vatom(i2,1) += 0.25*v[1];
+        v_vatom(i2,2) += 0.25*v[2];
+        v_vatom(i2,3) += 0.25*v[3];
+        v_vatom(i2,4) += 0.25*v[4];
+        v_vatom(i2,5) += 0.25*v[5];
+      }
+      if (newton_bond || i3 < nlocal) {
+        v_vatom(i3,0) += 0.25*v[0];
+        v_vatom(i3,1) += 0.25*v[1];
+        v_vatom(i3,2) += 0.25*v[2];
+        v_vatom(i3,3) += 0.25*v[3];
+        v_vatom(i3,4) += 0.25*v[4];
+        v_vatom(i3,5) += 0.25*v[5];
+      }
+      if (newton_bond || i4 < nlocal) {
+        v_vatom(i4,0) += 0.25*v[0];
+        v_vatom(i4,1) += 0.25*v[1];
+        v_vatom(i4,2) += 0.25*v[2];
+        v_vatom(i4,3) += 0.25*v[3];
+        v_vatom(i4,4) += 0.25*v[4];
+        v_vatom(i4,5) += 0.25*v[5];
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {    // explicit template instantiations for the supported device types
+template class DihedralNHarmonicKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU    // the separate LMPHostType instantiation is only emitted when this macro is defined
+template class DihedralNHarmonicKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/dihedral_nharmonic_kokkos.h b/src/KOKKOS/dihedral_nharmonic_kokkos.h
new file mode 100644
index 00000000000..7c06fdc2d44
--- /dev/null
+++ b/src/KOKKOS/dihedral_nharmonic_kokkos.h
@@ -0,0 +1,99 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+// clang-format off
+DihedralStyle(nharmonic/kk,DihedralNHarmonicKokkos<LMPDeviceType>);
+DihedralStyle(nharmonic/kk/device,DihedralNHarmonicKokkos<LMPDeviceType>);
+DihedralStyle(nharmonic/kk/host,DihedralNHarmonicKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_DIHEDRAL_NHARMONIC_KOKKOS_H
+#define LMP_DIHEDRAL_NHARMONIC_KOKKOS_H
+
+#include "dihedral_nharmonic.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+template<int NEWTON_BOND, int EVFLAG>
+struct TagDihedralNHarmonicCompute{};
+
+template<class DeviceType>
+class DihedralNHarmonicKokkos : public DihedralNHarmonic {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;
+
+  DihedralNHarmonicKokkos(class LAMMPS *);
+  ~DihedralNHarmonicKokkos() override;
+  void compute(int, int) override;
+  void coeff(int, char **) override;
+  void read_restart(FILE *) override;
+
+  template<int NEWTON_BOND, int EVFLAG>
+// NOLINTNEXTLINE
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagDihedralNHarmonicCompute<NEWTON_BOND,EVFLAG>, const int&, EV_FLOAT&) const;
+
+  template<int NEWTON_BOND, int EVFLAG>
+// NOLINTNEXTLINE
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagDihedralNHarmonicCompute<NEWTON_BOND,EVFLAG>, const int&) const;
+
+  // tally one dihedral into ev and per-atom virial; each tallied atom of i1..i4 gets a quarter share
+// NOLINTNEXTLINE
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EV_FLOAT &ev, const int i1, const int i2, const int i3, const int i4,
+                          KK_FLOAT &edihedral, KK_FLOAT *f1, KK_FLOAT *f3, KK_FLOAT *f4,
+                          const KK_FLOAT &vb1x, const KK_FLOAT &vb1y, const KK_FLOAT &vb1z,
+                          const KK_FLOAT &vb2x, const KK_FLOAT &vb2y, const KK_FLOAT &vb2z,
+                          const KK_FLOAT &vb3x, const KK_FLOAT &vb3y, const KK_FLOAT &vb3z) const;
+
+  DAT::ttransform_kkacc_1d k_eatom;
+  DAT::ttransform_kkacc_1d_6 k_vatom;
+
+ protected:
+  int allocated_kokkos;
+
+  class NeighborKokkos *neighborKK;
+
+  typename AT::t_kkfloat_1d_3_lr_randomread x;
+  typename AT::t_kkacc_1d_3 f;
+  typename AT::t_int_2d_lr dihedrallist;
+  typename AT::t_kkacc_1d d_eatom;
+  typename AT::t_kkacc_1d_6 d_vatom;
+
+  int nlocal,newton_bond;
+  int eflag,vflag;
+
+  DAT::tdual_int_scalar k_warning_flag;
+  typename AT::t_int_scalar d_warning_flag;
+  HAT::t_int_scalar h_warning_flag;
+
+  DAT::tdual_kkfloat_2d k_a;
+  DAT::tdual_int_1d k_nterms;
+
+  typename AT::t_kkfloat_2d d_a;
+  typename AT::t_int_1d d_nterms;
+
+  void allocate_kokkos();
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 87db605acc9..9c657b88064 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -65,11 +65,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 
   exchange_comm_changed = 0;
   forward_comm_changed = 0;
+  reverse_comm_changed = 0;
   forward_pair_comm_changed = 0;
   reverse_pair_comm_changed = 0;
   forward_fix_comm_changed = 0;
   reverse_fix_comm_changed = 0;
-  reverse_comm_changed = 0;
+  forward_compute_comm_changed = 0;
   sort_changed = atom_map_changed = 0;
 
   delete memory;
@@ -289,7 +290,8 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 
     exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0;
     forward_pair_comm_legacy = reverse_pair_comm_legacy =
-      forward_fix_comm_legacy = reverse_fix_comm_legacy = 0;
+      forward_fix_comm_legacy = reverse_fix_comm_legacy =
+      forward_compute_comm_legacy = 0;
     sort_legacy = 0;
     atom_map_legacy = 0;
 
@@ -306,7 +308,8 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 
     exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 1;
     forward_pair_comm_legacy = reverse_pair_comm_legacy =
-      forward_fix_comm_legacy = reverse_fix_comm_legacy = 1;
+      forward_fix_comm_legacy = reverse_fix_comm_legacy =
+      forward_compute_comm_legacy = 1;    // must stay 1: the chain assigns all five flags
     sort_legacy = 1;
     atom_map_legacy = 1;
 
@@ -492,19 +495,22 @@ void KokkosLMP::accelerator(int narg, char **arg)
       if (strcmp(arg[iarg+1],"no") == 0) {
         exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 1;
         forward_pair_comm_legacy = reverse_pair_comm_legacy =
-          forward_fix_comm_legacy = reverse_fix_comm_legacy = 1;
+          forward_fix_comm_legacy = reverse_fix_comm_legacy =
+          forward_compute_comm_legacy = 1;    // must stay 1: the chain assigns all five flags
 
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else if (strcmp(arg[iarg+1],"host") == 0) {
         exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0;
         forward_pair_comm_legacy = reverse_pair_comm_legacy =
-          forward_fix_comm_legacy = reverse_fix_comm_legacy = 1;
+          forward_fix_comm_legacy = reverse_fix_comm_legacy =
+          forward_compute_comm_legacy = 1;    // must stay 1: the chain assigns all five flags
 
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
       } else if (strcmp(arg[iarg+1],"device") == 0) {
         exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0;
         forward_pair_comm_legacy = reverse_pair_comm_legacy =
-          forward_fix_comm_legacy = reverse_fix_comm_legacy = 0;
+          forward_fix_comm_legacy = reverse_fix_comm_legacy =
+          forward_compute_comm_legacy = 0;
 
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
@@ -565,6 +571,14 @@ void KokkosLMP::accelerator(int narg, char **arg)
       else error->all(FLERR,"Illegal package kokkos command");
       reverse_fix_comm_changed = 0;
       iarg += 2;
+    } else if (strcmp(arg[iarg],"comm/compute/forward") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"no") == 0) forward_compute_comm_legacy = 1;
+      else if (strcmp(arg[iarg+1],"host") == 0) forward_compute_comm_legacy = 1;
+      else if (strcmp(arg[iarg+1],"device") == 0) forward_compute_comm_legacy = 0;
+      else error->all(FLERR,"Illegal package kokkos command");
+      forward_compute_comm_changed = 0;
+      iarg += 2;
     } else if (strcmp(arg[iarg],"comm/reverse") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       else if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_legacy = 1;
@@ -671,6 +685,10 @@ void KokkosLMP::accelerator(int narg, char **arg)
       reverse_fix_comm_legacy = 1;
       reverse_fix_comm_changed = 1;
     }
+    if (forward_compute_comm_legacy == 0) {
+      forward_compute_comm_legacy = 1;
+      forward_compute_comm_changed = 1;
+    }
     if (reverse_comm_legacy == 0 && reverse_comm_on_host == 0) {
       reverse_comm_legacy = 1;
       reverse_comm_changed = 1;
@@ -715,6 +733,10 @@ void KokkosLMP::accelerator(int narg, char **arg)
       reverse_fix_comm_legacy = 0;
       reverse_fix_comm_changed = 0;
     }
+    if (forward_compute_comm_changed) {
+      forward_compute_comm_legacy = 0;
+      forward_compute_comm_changed = 0;
+    }
     if (reverse_comm_changed) {
       reverse_comm_legacy = 0;
       reverse_comm_changed = 0;
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 4e3af0bdbcd..6dcf0091374 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -29,11 +29,12 @@ class KokkosLMP : protected Pointers {
   int neighflag_qeq_set;
   int exchange_comm_legacy;
   int forward_comm_legacy;
+  int reverse_comm_legacy;
   int forward_pair_comm_legacy;
   int reverse_pair_comm_legacy;
   int forward_fix_comm_legacy;
   int reverse_fix_comm_legacy;
-  int reverse_comm_legacy;
+  int forward_compute_comm_legacy;
   int sort_legacy;
   int atom_map_legacy;
   int exchange_comm_on_host;
@@ -41,11 +42,12 @@ class KokkosLMP : protected Pointers {
   int reverse_comm_on_host;
   int exchange_comm_changed;
   int forward_comm_changed;
+  int reverse_comm_changed;
   int forward_pair_comm_changed;
   int reverse_pair_comm_changed;
   int forward_fix_comm_changed;
   int reverse_fix_comm_changed;
-  int reverse_comm_changed;
+  int forward_compute_comm_changed;
   int sort_changed;
   int atom_map_changed;
   int nthreads,ngpus;
diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h
index 6b07a7fd2bc..2cbd1ec0003 100644
--- a/src/KOKKOS/kokkos_base.h
+++ b/src/KOKKOS/kokkos_base.h
@@ -25,7 +25,7 @@ class KokkosBase {
  public:
   KokkosBase() {}
 
-  // Pair
+  // Forward for Pair, Fix, Compute
   virtual int pack_forward_comm_kokkos(int, DAT::tdual_int_1d,
                                        DAT::tdual_double_1d &,
                                        int, int *) {return 0;};
@@ -35,18 +35,7 @@ class KokkosBase {
   virtual void unpack_reverse_comm_kokkos(int, DAT::tdual_int_1d,
                                           DAT::tdual_double_1d &) {}
 
-  // Fix
-  virtual int pack_forward_comm_fix_kokkos(int, DAT::tdual_int_1d,
-                                           DAT::tdual_double_1d &,
-                                           int, int *) {return 0;};
-  virtual void unpack_forward_comm_fix_kokkos(int, int, DAT::tdual_double_1d &) {}
-
-
-  virtual int pack_reverse_comm_fix_kokkos(int, int, DAT::tdual_double_1d &) {return 0;};
-  virtual void unpack_reverse_comm_fix_kokkos(int, DAT::tdual_int_1d,
-                                          int, DAT::tdual_double_1d &) {}
-
-
+  // Exchange
   virtual int pack_exchange_kokkos(const int & /*nsend*/, DAT::tdual_double_2d_lr & /*k_buf*/,
                                    DAT::tdual_int_1d /*k_sendlist*/,
                                    DAT::tdual_int_1d /*k_copylist*/,
diff --git a/src/MC/fix_atom_swap.cpp b/src/MC/fix_atom_swap.cpp
index 98af643aa0f..66b414c07bc 100644
--- a/src/MC/fix_atom_swap.cpp
+++ b/src/MC/fix_atom_swap.cpp
@@ -662,8 +662,9 @@ int FixAtomSwap::attempt_swap()
 
 double FixAtomSwap::energy_full()
 {
-  int eflag = 1;
-  int vflag = 0;
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   if (modify->n_pre_force) modify->pre_force(vflag);
 
diff --git a/src/MC/fix_charge_regulation.cpp b/src/MC/fix_charge_regulation.cpp
index fa1e6e35b04..32c281f45c0 100644
--- a/src/MC/fix_charge_regulation.cpp
+++ b/src/MC/fix_charge_regulation.cpp
@@ -1125,6 +1125,11 @@ int FixChargeRegulation::get_random_particle(int ptype, double charge, double rd
 /* ---------------------------------------------------------------------- */
 
 double FixChargeRegulation::energy_full() {
+
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
+
   if (triclinic) domain->x2lamda(atom->nlocal);
   domain->pbc();
   comm->exchange();
@@ -1133,8 +1138,7 @@ double FixChargeRegulation::energy_full() {
   if (triclinic) domain->lamda2x(atom->nlocal + atom->nghost);
   if (modify->n_pre_neighbor) modify->pre_neighbor();
   neighbor->build(1);
-  int eflag = 1;
-  int vflag = 0;
+
   if (overlap_flag) {
     int overlaptestall;
     int overlaptest = 0;
diff --git a/src/MC/fix_gcmc.cpp b/src/MC/fix_gcmc.cpp
index a66a461ca1b..16e5234e0b4 100644
--- a/src/MC/fix_gcmc.cpp
+++ b/src/MC/fix_gcmc.cpp
@@ -2329,8 +2329,10 @@ double FixGCMC::energy_full()
   if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
   if (modify->n_pre_neighbor) modify->pre_neighbor();
   neighbor->build(1);
-  int eflag = 1;
-  int vflag = 0;
+
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   // if overlap check requested, if overlap,
   // return signal value for energy
diff --git a/src/MC/fix_mol_swap.cpp b/src/MC/fix_mol_swap.cpp
index 94e1d0e1959..898bca59ed6 100644
--- a/src/MC/fix_mol_swap.cpp
+++ b/src/MC/fix_mol_swap.cpp
@@ -429,8 +429,9 @@ int FixMolSwap::attempt_swap()
 
 double FixMolSwap::energy_full()
 {
-  int eflag = 1;
-  int vflag = 0;
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   if (modify->n_pre_force) modify->pre_force(vflag);
 
diff --git a/src/MC/fix_neighbor_swap.cpp b/src/MC/fix_neighbor_swap.cpp
index 97f113b33bf..5ca266f8aa4 100644
--- a/src/MC/fix_neighbor_swap.cpp
+++ b/src/MC/fix_neighbor_swap.cpp
@@ -632,8 +632,9 @@ int FixNeighborSwap::attempt_swap()
 
 double FixNeighborSwap::energy_full()
 {
-  int eflag = 1;
-  int vflag = 0;
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   if (modify->n_pre_force) modify->pre_force(vflag);
 
diff --git a/src/MC/fix_sgcmc.cpp b/src/MC/fix_sgcmc.cpp
index 66a810e88a0..9b06f77608d 100644
--- a/src/MC/fix_sgcmc.cpp
+++ b/src/MC/fix_sgcmc.cpp
@@ -896,8 +896,9 @@ double FixSemiGrandCanonicalMC::computeEnergyChangeGeneric(int flipAtom, int old
  *********************************************************************/
 double FixSemiGrandCanonicalMC::computeTotalEnergy()
 {
-  int eflag = 1;
-  int vflag = 0;
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   if (force->pair) force->pair->compute(eflag,vflag);
 
diff --git a/src/MC/fix_widom.cpp b/src/MC/fix_widom.cpp
index 6862ff1feba..c8311079b54 100644
--- a/src/MC/fix_widom.cpp
+++ b/src/MC/fix_widom.cpp
@@ -1055,8 +1055,10 @@ double FixWidom::energy_full()
   if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
   if (modify->n_pre_neighbor) modify->pre_neighbor();
   neighbor->build(1);
-  int eflag = 1;
-  int vflag = 0;
+
+  // flag that we only need to compute the global energy
+  int eflag = ENERGY_GLOBAL | ENERGY_ONLY;
+  int vflag = VIRIAL_NONE;
 
   // clear forces so they don't accumulate over multiple
   // calls within fix widom timestep
diff --git a/src/OPENMP/reaxff_torsion_angles_omp.cpp b/src/OPENMP/reaxff_torsion_angles_omp.cpp
index 945a670a512..8a36ec0ff66 100644
--- a/src/OPENMP/reaxff_torsion_angles_omp.cpp
+++ b/src/OPENMP/reaxff_torsion_angles_omp.cpp
@@ -62,7 +62,6 @@ namespace ReaxFF {
       int type_i, type_j, type_k, type_l;
       int start_j, end_j;
       int start_pj, end_pj, start_pk, end_pk;
-      int num_frb_intrs = 0;
 
       double Delta_j, Delta_k;
       double r_ij, r_jk, r_kl, r_li;
@@ -200,12 +199,7 @@ namespace ReaxFF {
 
                     if (i != l && fbh->cnt &&
                         bo_kl->BO > control->thb_cut/*0*/ &&
-                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut/*0*/) {
-                      ++num_frb_intrs;
-                      //fprintf(stderr,
-                      //      "%5d: %6d %6d %6d %6d\n", num_frb_intrs,
-                      //      system->my_atoms[i].orig_id,system->my_atoms[j].orig_id,
-                      //      system->my_atoms[k].orig_id,system->my_atoms[l].orig_id);
+                        bo_ij->BO * bo_jk->BO * bo_kl->BO > control->thb_cut) {
 
                       r_kl = pbond_kl->d;
                       BOA_kl = bo_kl->BO - control->thb_cut;
@@ -213,7 +207,6 @@ namespace ReaxFF {
                       theta_jkl = p_jkl->theta;
                       sin_jkl = sin(theta_jkl);
                       cos_jkl = cos(theta_jkl);
-                      //tan_jkl_i = 1. / tan(theta_jkl);
                       if (sin_jkl >= 0 && sin_jkl <= MIN_SINE)
                         tan_jkl_i = cos_jkl / MIN_SINE;
                       else if (sin_jkl <= 0 && sin_jkl >= -MIN_SINE)
diff --git a/src/angle.cpp b/src/angle.cpp
index 168b8fe8067..a7a15a2df15 100644
--- a/src/angle.cpp
+++ b/src/angle.cpp
@@ -97,14 +97,15 @@ void Angle::settings(int narg, char **args)
    see integrate::ev_set() for bitwise settings of eflag/vflag
    set the following flags, values are otherwise set to 0:
      evflag       != 0 if any bits of eflag or vflag are set
-     eflag_global != 0 if ENERGY_GLOBAL bit of eflag set
-     eflag_atom   != 0 if ENERGY_ATOM bit of eflag set
+     eflag_global != 0 if ENERGY_GLOBAL bit of eflag is set
+     eflag_atom   != 0 if ENERGY_ATOM bit of eflag is set
      eflag_either != 0 if eflag_global or eflag_atom is set
-     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag set
-     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag set
-     vflag_atom   != 0 if VIRIAL_CENTROID bit of vflag set
+     eflag_only   != 0 if ENERGY_GLOBAL and ENERGY_ONLY bits of eflag are set
+     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag is set
+     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag is set
+     vflag_atom   != 0 if VIRIAL_CENTROID bit of vflag is set
                        and centroidstressflag != CENTROID_AVAIL
-     cvflag_atom  != 0 if VIRIAL_CENTROID bit of vflag set
+     cvflag_atom  != 0 if VIRIAL_CENTROID bit of vflag is set
                        and centroidstressflag = CENTROID_AVAIL
      vflag_either != 0 if any of vflag_global, vflag_atom, cvflag_atom is set
 ------------------------------------------------------------------------- */
@@ -115,9 +116,10 @@ void Angle::ev_setup(int eflag, int vflag, int alloc)
 
   evflag = 1;
 
-  eflag_either = eflag;
+  eflag_either = eflag & (ENERGY_GLOBAL | ENERGY_ATOM);
   eflag_global = eflag & ENERGY_GLOBAL;
   eflag_atom = eflag & ENERGY_ATOM;
+  eflag_only = eflag_global ? (eflag & ENERGY_ONLY) : 0;
 
   vflag_global = vflag & (VIRIAL_PAIR | VIRIAL_FDOTR);
   vflag_atom = vflag & VIRIAL_ATOM;
diff --git a/src/angle.h b/src/angle.h
index 83ba6a236bc..93f8148cb3a 100644
--- a/src/angle.h
+++ b/src/angle.h
@@ -74,7 +74,7 @@ class Angle : protected Pointers {
   int suffix_flag;    // suffix compatibility flag
 
   int evflag;
-  int eflag_either, eflag_global, eflag_atom;
+  int eflag_either, eflag_global, eflag_atom, eflag_only;
   int vflag_either, vflag_global, vflag_atom, cvflag_atom;
   int maxeatom, maxvatom, maxcvatom;
 
@@ -83,8 +83,8 @@ class Angle : protected Pointers {
     if (eflag || vflag)
       ev_setup(eflag, vflag, alloc);
     else
-      evflag = eflag_either = eflag_global = eflag_atom = vflag_either = vflag_global = vflag_atom =
-          cvflag_atom = 0;
+      evflag = eflag_either = eflag_global = eflag_atom = eflag_only = vflag_either = vflag_global =
+          vflag_atom = cvflag_atom = 0;
   }
   void ev_setup(int, int, int alloc = 1);
   void ev_tally(int, int, int, int, int, double, double *, double *, double, double, double, double,
diff --git a/src/atom_map.cpp b/src/atom_map.cpp
index 37b46182c14..17c9237fa9e 100644
--- a/src/atom_map.cpp
+++ b/src/atom_map.cpp
@@ -335,8 +335,8 @@ void Atom::map_delete()
     map_array = nullptr;
   } else {
     if (map_nhash) {
-      delete [] map_bucket;
-      delete [] map_hash;
+      delete[] map_bucket;
+      delete[] map_hash;
       map_bucket = nullptr;
       map_hash = nullptr;
     }
diff --git a/src/atom_vec_ellipsoid.cpp b/src/atom_vec_ellipsoid.cpp
index b806b0d9c25..e67796681f8 100644
--- a/src/atom_vec_ellipsoid.cpp
+++ b/src/atom_vec_ellipsoid.cpp
@@ -26,6 +26,7 @@
 #include "memory.h"
 #include "modify.h"
 
+#include <algorithm>
 #include <cstring>
 
 using namespace LAMMPS_NS;
@@ -35,8 +36,8 @@ static constexpr double EPSILON_BLOCK = 1.0e-3;
 /* ---------------------------------------------------------------------- */
 
 AtomVecEllipsoid::AtomVecEllipsoid(LAMMPS *lmp) :
-    AtomVec(lmp), bonus(nullptr), ellipsoid(nullptr), rmass(nullptr), angmom(nullptr),
-    quat_hold(nullptr), bonus_super(nullptr)
+    AtomVec(lmp), bonus(nullptr), bonus_super(nullptr), ellipsoid(nullptr), rmass(nullptr),
+    angmom(nullptr), quat_hold(nullptr)
 {
   molecular = Atom::ATOMIC;
   bonus_flag = 1;
@@ -598,7 +599,7 @@ void AtomVecEllipsoid::data_atom_bonus(int m, const std::vector<std::string> &va
 
     double *block = bonus_super[nlocal_bonus].block;
     BlockType &type = bonus_super[nlocal_bonus].type;
-    if (ivalue == values.size()) {
+    if (ivalue == (int) values.size()) {
       block[0] = block[1] = 2.0;
       type = BlockType::ELLIPSOID;
     } else {
@@ -1014,7 +1015,7 @@ AtomVecEllipsoid::BlockType AtomVecEllipsoid::determine_type(double *block)
 
 double AtomVecEllipsoid::radius_ellipsoid(double *shape, double *block, BlockType flag_type)
 {
-  if (flag_type == BlockType::ELLIPSOID) return std::max(std::max(shape[0], shape[1]), shape[2]);
+  if (flag_type == BlockType::ELLIPSOID) return std::max({shape[0], shape[1], shape[2]});
 
   // Super ellipsoid
   double a = shape[0], b = shape[1], c = shape[2];
@@ -1099,13 +1100,13 @@ void AtomVecEllipsoid::process_args(int narg, char **arg)
       size_data_bonus = 10;
 
       // Add radius to the arrays for communication
-      fields_grow.push_back("radius");
-      fields_copy.push_back("radius");
-      fields_border.push_back("radius");
-      fields_border_vel.push_back("radius");
-      fields_exchange.push_back("radius");
-      fields_restart.push_back("radius");
-      fields_create.push_back("radius");
+      fields_grow.emplace_back("radius");
+      fields_copy.emplace_back("radius");
+      fields_border.emplace_back("radius");
+      fields_border_vel.emplace_back("radius");
+      fields_exchange.emplace_back("radius");
+      fields_restart.emplace_back("radius");
+      fields_create.emplace_back("radius");
 
       setup_fields();
 
diff --git a/src/bond.cpp b/src/bond.cpp
index 802ac2b6c18..2593536caaf 100644
--- a/src/bond.cpp
+++ b/src/bond.cpp
@@ -105,11 +105,12 @@ void Bond::settings(int narg, char **args)
    see integrate::ev_set() for bitwise settings of eflag/vflag
    set the following flags, values are otherwise set to 0:
      evflag       != 0 if any bits of eflag or vflag are set
-     eflag_global != 0 if ENERGY_GLOBAL bit of eflag set
-     eflag_atom   != 0 if ENERGY_ATOM bit of eflag set
+     eflag_global != 0 if ENERGY_GLOBAL bit of eflag is set
+     eflag_atom   != 0 if ENERGY_ATOM bit of eflag is set
      eflag_either != 0 if eflag_global or eflag_atom is set
-     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag set
-     vflag_atom   != 0 if VIRIAL_ATOM or VIRIAL_CENTROID bit of vflag set
+     eflag_only   != 0 if ENERGY_GLOBAL and ENERGY_ONLY bits of eflag are set
+     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag is set
+     vflag_atom   != 0 if VIRIAL_ATOM or VIRIAL_CENTROID bit of vflag is set
                        two-body and centroid stress are identical for bonds
      vflag_either != 0 if vflag_global or vflag_atom is set
 ------------------------------------------------------------------------- */
@@ -120,9 +121,10 @@ void Bond::ev_setup(int eflag, int vflag, int alloc)
 
   evflag = 1;
 
-  eflag_either = eflag;
+  eflag_either = eflag & (ENERGY_GLOBAL | ENERGY_ATOM);
   eflag_global = eflag & ENERGY_GLOBAL;
   eflag_atom = eflag & ENERGY_ATOM;
+  eflag_only = eflag_global ? (eflag & ENERGY_ONLY) : 0;
 
   vflag_either = vflag;
   vflag_global = vflag & (VIRIAL_PAIR | VIRIAL_FDOTR);
diff --git a/src/bond.h b/src/bond.h
index c844dc9acb0..8804b805774 100644
--- a/src/bond.h
+++ b/src/bond.h
@@ -88,7 +88,7 @@ class Bond : protected Pointers {
   int suffix_flag;    // suffix compatibility flag
 
   int evflag;
-  int eflag_either, eflag_global, eflag_atom;
+  int eflag_either, eflag_global, eflag_atom, eflag_only;
   int vflag_either, vflag_global, vflag_atom;
   int maxeatom, maxvatom;
 
@@ -97,8 +97,8 @@ class Bond : protected Pointers {
     if (eflag || vflag)
       ev_setup(eflag, vflag, alloc);
     else
-      evflag = eflag_either = eflag_global = eflag_atom = vflag_either = vflag_global = vflag_atom =
-          0;
+      evflag = eflag_either = eflag_global = eflag_atom = eflag_only = vflag_either = vflag_global =
+          vflag_atom = 0;
   }
   void ev_setup(int, int, int alloc = 1);
   void ev_tally(int, int, int, int, double, double, double, double, double);
diff --git a/src/compute.cpp b/src/compute.cpp
index dbe190b06d2..fc7e26ed86a 100644
--- a/src/compute.cpp
+++ b/src/compute.cpp
@@ -99,6 +99,7 @@ Compute::Compute(LAMMPS *lmp, int narg, char **arg) :
 
   copymode = 0;
   kokkosable = 0;
+  forward_comm_device = 0;
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/compute.h b/src/compute.h
index 1d3d58eb89c..047dc5c2f8f 100644
--- a/src/compute.h
+++ b/src/compute.h
@@ -111,6 +111,7 @@ class Compute : protected Pointers {
   uint64_t datamask_read, datamask_modify;
 
   int copymode, kokkosable;
+  int forward_comm_device;    // 1 if forward comm on Device
 
   Compute(class LAMMPS *, int, char **);
   ~Compute() override;
diff --git a/src/dihedral.cpp b/src/dihedral.cpp
index 7be96c8c5dd..f904aa86ef2 100644
--- a/src/dihedral.cpp
+++ b/src/dihedral.cpp
@@ -96,14 +96,15 @@ void Dihedral::settings(int narg, char **args)
    see integrate::ev_set() for bitwise settings of eflag/vflag
    set the following flags, values are otherwise set to 0:
      evflag       != 0 if any bits of eflag or vflag are set
-     eflag_global != 0 if ENERGY_GLOBAL bit of eflag set
-     eflag_atom   != 0 if ENERGY_ATOM bit of eflag set
+     eflag_global != 0 if ENERGY_GLOBAL bit of eflag is set
+     eflag_atom   != 0 if ENERGY_ATOM bit of eflag is set
      eflag_either != 0 if eflag_global or eflag_atom is set
-     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag set
-     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag set
-     vflag_atom   != 0 if VIRIAL_CENTROID bit of vflag set
+     eflag_only   != 0 if ENERGY_GLOBAL and ENERGY_ONLY bits of eflag are set
+     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag is set
+     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag is set
+     vflag_atom   != 0 if VIRIAL_CENTROID bit of vflag is set
                        and centroidstressflag != CENTROID_AVAIL
-     cvflag_atom  != 0 if VIRIAL_CENTROID bit of vflag set
+     cvflag_atom  != 0 if VIRIAL_CENTROID bit of vflag is set
                        and centroidstressflag = CENTROID_AVAIL
      vflag_either != 0 if any of vflag_global, vflag_atom, cvflag_atom is set
 ------------------------------------------------------------------------- */
@@ -114,9 +115,10 @@ void Dihedral::ev_setup(int eflag, int vflag, int alloc)
 
   evflag = 1;
 
-  eflag_either = eflag;
+  eflag_either = eflag & (ENERGY_GLOBAL | ENERGY_ATOM);
   eflag_global = eflag & ENERGY_GLOBAL;
   eflag_atom = eflag & ENERGY_ATOM;
+  eflag_only = eflag_global ? (eflag & ENERGY_ONLY) : 0;
 
   vflag_global = vflag & (VIRIAL_PAIR | VIRIAL_FDOTR);
   vflag_atom = vflag & VIRIAL_ATOM;
diff --git a/src/dihedral.h b/src/dihedral.h
index 72895f0b11f..251689d2291 100644
--- a/src/dihedral.h
+++ b/src/dihedral.h
@@ -37,8 +37,8 @@ class Dihedral : protected Pointers {
                              // CENTROID_AVAIL = different and implemented
                              // CENTROID_NOTAVAIL = different, not yet implemented
 
-  int reinitflag;            // 0 if not compatible with fix adapt
-                             // extract() method may still need to be added
+  int reinitflag;    // 0 if not compatible with fix adapt
+                     // extract() method may still need to be added
 
   // KOKKOS host/device flag and data masks
 
@@ -72,7 +72,7 @@ class Dihedral : protected Pointers {
   int suffix_flag;    // suffix compatibility flag
 
   int evflag;
-  int eflag_either, eflag_global, eflag_atom;
+  int eflag_either, eflag_global, eflag_atom, eflag_only;
   int vflag_either, vflag_global, vflag_atom, cvflag_atom;
   int maxeatom, maxvatom, maxcvatom;
 
@@ -81,8 +81,8 @@ class Dihedral : protected Pointers {
     if (eflag || vflag)
       ev_setup(eflag, vflag, alloc);
     else
-      evflag = eflag_either = eflag_global = eflag_atom = vflag_either = vflag_global = vflag_atom =
-          cvflag_atom = 0;
+      evflag = eflag_either = eflag_global = eflag_atom = eflag_only = vflag_either = vflag_global =
+          vflag_atom = cvflag_atom = 0;
   }
   void ev_setup(int, int, int alloc = 1);
   void ev_tally(int, int, int, int, int, int, double, double *, double *, double *, double, double,
diff --git a/src/fix.cpp b/src/fix.cpp
index f02a656ce0e..884ffa854be 100644
--- a/src/fix.cpp
+++ b/src/fix.cpp
@@ -198,6 +198,9 @@ void Fix::set_molecule(int, tagint, int, double *, double *, double *)
    if thermo_energy is not set, energy tallying is disabled
    if thermo_virial is not set, virial tallying is disabled
    global energy is tallied separately, output by compute_scalar() method
+   ENERGY_ONLY flag should only be set manually and may be ignored
+   it is meant to be used for cases where computation of only the
+   energy is *much* faster.
 ------------------------------------------------------------------------- */
 
 void Fix::ev_setup(int eflag, int vflag)
@@ -206,11 +209,12 @@ void Fix::ev_setup(int eflag, int vflag)
 
   evflag = 1;
 
-  if (!thermo_energy) eflag_either = eflag_global = eflag_atom = 0;
+  if (!thermo_energy) eflag_either = eflag_global = eflag_atom = eflag_only = 0;
   else {
-    eflag_either = eflag;
+    eflag_either = eflag & (ENERGY_GLOBAL | ENERGY_ATOM);
     eflag_global = eflag & ENERGY_GLOBAL;
     eflag_atom = eflag & ENERGY_ATOM;
+    eflag_only = eflag_global ? (eflag & ENERGY_ONLY) : 0;
   }
 
   if (!thermo_virial) vflag_either = vflag_global = vflag_atom = 0;
diff --git a/src/fix.h b/src/fix.h
index 685399109a3..5f259a666c3 100644
--- a/src/fix.h
+++ b/src/fix.h
@@ -49,37 +49,37 @@ class Fix : protected Pointers {
   };
   // clang-format on
 
-  bigint next_reneighbor;      // next timestep to force a reneighboring
-  int nevery;                  // how often to call an end_of_step fix
-  int thermo_energy;           // 1 if fix_modify energy enabled, 0 if not
-  int thermo_virial;           // 1 if fix_modify virial enabled, 0 if not
-  int thermo_modify_colname;   // 1 if fix has custom column names for output
-  int energy_global_flag;      // 1 if contributes to global eng
-  int energy_peratom_flag;     // 1 if contributes to peratom eng
-  int virial_global_flag;      // 1 if contributes to global virial
-  int virial_peratom_flag;     // 1 if contributes to peratom virial
-  int ecouple_flag;            // 1 if thermostat fix outputs cumulative
-                               //      reservoir energy via compute_scalar()
-  int time_integrate;          // 1 if performs time integration, 0 if no
-  int rigid_flag;              // 1 if integrates rigid bodies, 0 if not
-  int no_change_box;           // 1 if cannot swap ortho <-> triclinic
-  int time_depend;             // 1 if requires continuous timestepping
-  int create_attribute;        // 1 if fix stores attributes that need
-                               //      setting when a new atom is created
-  int restart_pbc;             // 1 if fix moves atoms (except integrate)
-                               //      so write_restart must remap to PBC
-  int wd_header;               // # of header values fix writes to data file
-  int wd_section;              // # of sections fix writes to data file
-  int dynamic_group_allow;     // 1 if can be used with dynamic group, else 0
-  int dof_flag;                // 1 if has dof() method (not min_dof())
-  int special_alter_flag;      // 1 if has special_alter() meth for spec lists
-  int respa_level_support;     // 1 if fix supports fix_modify respa
-  int respa_level;             // which respa level to apply fix (1-Nrespa)
-  int maxexchange;             // max # of per-atom values for Comm::exchange()
-  int maxexchange_dynamic;     // 1 if fix sets maxexchange dynamically
-  int pre_exchange_migrate;    // 1 if fix migrates atoms in pre_exchange()
-  int stores_ids;              // 1 if fix stores atom IDs
-  int diam_flag;               // 1 if fix may change partical diameter
+  bigint next_reneighbor;       // next timestep to force a reneighboring
+  int nevery;                   // how often to call an end_of_step fix
+  int thermo_energy;            // 1 if fix_modify energy enabled, 0 if not
+  int thermo_virial;            // 1 if fix_modify virial enabled, 0 if not
+  int thermo_modify_colname;    // 1 if fix has custom column names for output
+  int energy_global_flag;       // 1 if contributes to global eng
+  int energy_peratom_flag;      // 1 if contributes to peratom eng
+  int virial_global_flag;       // 1 if contributes to global virial
+  int virial_peratom_flag;      // 1 if contributes to peratom virial
+  int ecouple_flag;             // 1 if thermostat fix outputs cumulative
+                                //      reservoir energy via compute_scalar()
+  int time_integrate;           // 1 if performs time integration, 0 if no
+  int rigid_flag;               // 1 if integrates rigid bodies, 0 if not
+  int no_change_box;            // 1 if cannot swap ortho <-> triclinic
+  int time_depend;              // 1 if requires continuous timestepping
+  int create_attribute;         // 1 if fix stores attributes that need
+                                //      setting when a new atom is created
+  int restart_pbc;              // 1 if fix moves atoms (except integrate)
+                                //      so write_restart must remap to PBC
+  int wd_header;                // # of header values fix writes to data file
+  int wd_section;               // # of sections fix writes to data file
+  int dynamic_group_allow;      // 1 if can be used with dynamic group, else 0
+  int dof_flag;                 // 1 if has dof() method (not min_dof())
+  int special_alter_flag;       // 1 if has special_alter() meth for spec lists
+  int respa_level_support;      // 1 if fix supports fix_modify respa
+  int respa_level;              // which respa level to apply fix (1-Nrespa)
+  int maxexchange;              // max # of per-atom values for Comm::exchange()
+  int maxexchange_dynamic;      // 1 if fix sets maxexchange dynamically
+  int pre_exchange_migrate;     // 1 if fix migrates atoms in pre_exchange()
+  int stores_ids;               // 1 if fix stores atom IDs
+  int diam_flag;                // 1 if fix may change particle diameter
 
   int scalar_flag;                 // 0/1 if compute_scalar() function exists
   int vector_flag;                 // 0/1 if compute_vector() function exists
@@ -238,7 +238,7 @@ class Fix : protected Pointers {
   virtual double compute_scalar() { return 0.0; }
   virtual double compute_vector(int) { return 0.0; }
   virtual double compute_array(int, int) { return 0.0; }
-  virtual std::string get_thermo_colname(int) { return {};  }
+  virtual std::string get_thermo_colname(int) { return {}; }
 
   virtual bigint dof(int) { return 0; }
   virtual void deform(int) {}
@@ -273,7 +273,7 @@ class Fix : protected Pointers {
   int instance_me;    // which Fix class instantiation I am
 
   int evflag;
-  int eflag_either, eflag_global, eflag_atom;
+  int eflag_either, eflag_global, eflag_atom, eflag_only;
   int vflag_either, vflag_global, vflag_atom, cvflag_atom;
   int maxeatom, maxvatom, maxcvatom;
 
@@ -287,8 +287,8 @@ class Fix : protected Pointers {
     if ((eflag && thermo_energy) || (vflag && thermo_virial))
       ev_setup(eflag, vflag);
     else
-      evflag = eflag_either = eflag_global = eflag_atom = vflag_either = vflag_global = vflag_atom =
-          cvflag_atom = 0;
+      evflag = eflag_either = eflag_global = eflag_atom = eflag_only = vflag_either = vflag_global =
+          vflag_atom = cvflag_atom = 0;
   }
   void ev_setup(int, int);
   void ev_tally(int, int *, double, double, double *);
diff --git a/src/fix_adapt.cpp b/src/fix_adapt.cpp
index 6732dbbb180..6474896a270 100644
--- a/src/fix_adapt.cpp
+++ b/src/fix_adapt.cpp
@@ -816,7 +816,7 @@ void FixAdapt::change_settings()
       // for scaleflag, previous_diam_scale is the scale factor on previous step
 
       if (ad->atomparam == DIAMETER) {
-        double scale;
+        double scale = 1.0;
         double *radius = atom->radius;
         double *rmass = atom->rmass;
         int *mask = atom->mask;
@@ -825,6 +825,12 @@ void FixAdapt::change_settings()
 
         if (scaleflag) scale = value / previous_diam_scale;
 
+        // mass must not become zero and radius must not be negative
+        if (massflag && ((scale == 0.0) || (value == 0.0)))
+          error->all(FLERR, Error::NOLASTLINE, "Fix adapt particle mass has become 0.0");
+        if (!massflag && ((scale < 0.0) || (value < 0.0)))
+          error->all(FLERR, Error::NOLASTLINE, "Fix adapt particle diameter has become negative");
+
         for (i = 0; i < nall; i++) {
           if (mask[i] & groupbit) {
             if (massflag) {
diff --git a/src/fix_nh.cpp b/src/fix_nh.cpp
index 6588553b962..e74327001ca 100644
--- a/src/fix_nh.cpp
+++ b/src/fix_nh.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -40,22 +39,24 @@
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
-static constexpr double DELTAFLIP = 0.1;
-static constexpr double TILTMAX = 1.5;
-static constexpr double EPSILON = 1.0e-6;
+namespace {
+constexpr double DELTAFLIP = 0.1;
+constexpr double TILTMAX = 1.5;
+constexpr double EPSILON = 1.0e-6;
 
-enum{NONE,XYZ,XY,YZ,XZ};
-enum{ISO,ANISO,TRICLINIC};
-enum{NOBIAS,BIAS};
+enum { NONE, XYZ, XY, YZ, XZ };
+enum { ISO, ANISO, TRICLINIC };
+enum { NOBIAS, BIAS };
+}    // namespace
 
 /* ----------------------------------------------------------------------
    NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion
  ---------------------------------------------------------------------- */
 
 FixNH::FixNH(LAMMPS *lmp, int narg, char **arg) :
-    Fix(lmp, narg, arg), id_dilate(nullptr), irregular(nullptr), step_respa(nullptr), id_temp(nullptr),
-    id_press(nullptr), eta(nullptr), eta_dot(nullptr), eta_dotdot(nullptr), eta_mass(nullptr),
-    etap(nullptr), etap_dot(nullptr), etap_dotdot(nullptr), etap_mass(nullptr)
+    Fix(lmp, narg, arg), id_dilate(nullptr), irregular(nullptr), step_respa(nullptr),
+    id_temp(nullptr), id_press(nullptr), eta(nullptr), eta_dot(nullptr), eta_dotdot(nullptr),
+    eta_mass(nullptr), etap(nullptr), etap_dot(nullptr), etap_dotdot(nullptr), etap_mass(nullptr)
 {
   if (narg < 4) utils::missing_cmd_args(FLERR, std::string("fix ") + style, error);
 
@@ -106,9 +107,9 @@ FixNH::FixNH(LAMMPS *lmp, int narg, char **arg) :
 
   // set fixed-point to default = center of cell
 
-  fixedpoint[0] = 0.5*(domain->boxlo[0]+domain->boxhi[0]);
-  fixedpoint[1] = 0.5*(domain->boxlo[1]+domain->boxhi[1]);
-  fixedpoint[2] = 0.5*(domain->boxlo[2]+domain->boxhi[2]);
+  fixedpoint[0] = 0.5 * (domain->boxlo[0] + domain->boxhi[0]);
+  fixedpoint[1] = 0.5 * (domain->boxlo[1] + domain->boxhi[1]);
+  fixedpoint[2] = 0.5 * (domain->boxlo[2] + domain->boxhi[2]);
 
   // used by FixNVTSllod to preserve non-default value
 
@@ -129,56 +130,57 @@ FixNH::FixNH(LAMMPS *lmp, int narg, char **arg) :
   int iarg = 3;
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"temp") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} temp", style), error);
+    if (strcmp(arg[iarg], "temp") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} temp", style), error);
       tstat_flag = 1;
-      t_start = utils::numeric(FLERR,arg[iarg+1],false,lmp);
+      t_start = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
       t_target = t_start;
-      t_stop = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      t_period = utils::numeric(FLERR,arg[iarg+3],false,lmp);
-      if (t_start <= 0.0 || t_stop <= 0.0)
-        error->all(FLERR, "Target temperature for fix {} cannot be 0.0", style);
+      t_stop = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      t_period = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
+      if (t_start <= 0.0)
+        error->all(FLERR, iarg + 1, "Target temperature for fix {} cannot be <= 0.0", style);
+      if (t_stop <= 0.0)
+        error->all(FLERR, iarg + 2, "Target temperature for fix {} cannot be <= 0.0", style);
+      if (t_period <= 0.0)
+        error->all(FLERR, iarg + 3, "Temperature damping for fix {} cannot be <= 0.0", style);
       iarg += 4;
 
-    } else if (strcmp(arg[iarg],"iso") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} iso", style), error);
+    } else if (strcmp(arg[iarg], "iso") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} iso", style), error);
       pcouple = XYZ;
-      p_start[0] = p_start[1] = p_start[2] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[0] = p_stop[1] = p_stop[2] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[0] = p_period[1] = p_period[2] =
-        utils::numeric(FLERR,arg[iarg+3],false,lmp);
+      p_start[0] = p_start[1] = p_start[2] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[0] = p_stop[1] = p_stop[2] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[0] = p_period[1] = p_period[2] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[0] = p_flag[1] = p_flag[2] = 1;
       if (dimension == 2) {
         p_start[2] = p_stop[2] = p_period[2] = 0.0;
         p_flag[2] = 0;
       }
       iarg += 4;
-    } else if (strcmp(arg[iarg],"aniso") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} aniso", style), error);
+    } else if (strcmp(arg[iarg], "aniso") == 0) {
+      if (iarg + 4 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} aniso", style), error);
       pcouple = NONE;
-      p_start[0] = p_start[1] = p_start[2] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[0] = p_stop[1] = p_stop[2] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[0] = p_period[1] = p_period[2] =
-        utils::numeric(FLERR,arg[iarg+3],false,lmp);
+      p_start[0] = p_start[1] = p_start[2] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[0] = p_stop[1] = p_stop[2] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[0] = p_period[1] = p_period[2] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[0] = p_flag[1] = p_flag[2] = 1;
       if (dimension == 2) {
         p_start[2] = p_stop[2] = p_period[2] = 0.0;
         p_flag[2] = 0;
       }
       iarg += 4;
-    } else if (strcmp(arg[iarg],"tri") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} tri", style), error);
+    } else if (strcmp(arg[iarg], "tri") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} tri", style), error);
       pcouple = NONE;
       scalexy = scalexz = scaleyz = 0;
-      p_start[0] = p_start[1] = p_start[2] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[0] = p_stop[1] = p_stop[2] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[0] = p_period[1] = p_period[2] =
-        utils::numeric(FLERR,arg[iarg+3],false,lmp);
+      p_start[0] = p_start[1] = p_start[2] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[0] = p_stop[1] = p_stop[2] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[0] = p_period[1] = p_period[2] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[0] = p_flag[1] = p_flag[2] = 1;
       p_start[3] = p_start[4] = p_start[5] = 0.0;
       p_stop[3] = p_stop[4] = p_stop[5] = 0.0;
-      p_period[3] = p_period[4] = p_period[5] =
-        utils::numeric(FLERR,arg[iarg+3],false,lmp);
+      p_period[3] = p_period[4] = p_period[5] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[3] = p_flag[4] = p_flag[5] = 1;
       if (dimension == 2) {
         p_start[2] = p_stop[2] = p_period[2] = 0.0;
@@ -189,190 +191,225 @@ FixNH::FixNH(LAMMPS *lmp, int narg, char **arg) :
         p_flag[4] = 0;
       }
       iarg += 4;
-    } else if (strcmp(arg[iarg],"x") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} x", style), error);
-      p_start[0] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[0] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[0] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
+    } else if (strcmp(arg[iarg], "x") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} x", style), error);
+      p_start[0] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[0] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[0] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[0] = 1;
       deviatoric_flag = 1;
       iarg += 4;
-    } else if (strcmp(arg[iarg],"y") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} y", style), error);
-      p_start[1] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[1] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[1] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
+    } else if (strcmp(arg[iarg], "y") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} y", style), error);
+      p_start[1] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[1] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[1] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[1] = 1;
       deviatoric_flag = 1;
       iarg += 4;
-    } else if (strcmp(arg[iarg],"z") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} z", style), error);
-      p_start[2] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[2] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[2] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
+    } else if (strcmp(arg[iarg], "z") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} z", style), error);
+      p_start[2] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[2] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[2] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[2] = 1;
       deviatoric_flag = 1;
       iarg += 4;
-      if (dimension == 2) error->all(FLERR,"Invalid fix {} command for a 2d simulation", style);
-
-    } else if (strcmp(arg[iarg],"yz") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} yz", style), error);
-      p_start[3] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[3] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[3] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
+      if (dimension == 2)
+        error->all(FLERR, iarg - 4, "Invalid fix {} command for a 2d simulation", style);
+
+    } else if (strcmp(arg[iarg], "yz") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} yz", style), error);
+      p_start[3] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[3] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[3] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[3] = 1;
       deviatoric_flag = 1;
       scaleyz = 0;
       iarg += 4;
-      if (dimension == 2) error->all(FLERR,"Invalid fix {} command for a 2d simulation", style);
-    } else if (strcmp(arg[iarg],"xz") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} xz", style), error);
-      p_start[4] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[4] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[4] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
+      if (dimension == 2)
+        error->all(FLERR, iarg - 4, "Invalid fix {} command for a 2d simulation", style);
+    } else if (strcmp(arg[iarg], "xz") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} xz", style), error);
+      p_start[4] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[4] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[4] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[4] = 1;
       deviatoric_flag = 1;
       scalexz = 0;
       iarg += 4;
-      if (dimension == 2) error->all(FLERR,"Invalid fix {} command for a 2d simulation", style);
-    } else if (strcmp(arg[iarg],"xy") == 0) {
-      if (iarg+4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} xy", style), error);
-      p_start[5] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      p_stop[5] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      p_period[5] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
+      if (dimension == 2)
+        error->all(FLERR, iarg - 4, "Invalid fix {} command for a 2d simulation", style);
+    } else if (strcmp(arg[iarg], "xy") == 0) {
+      if (iarg + 4 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} xy", style), error);
+      p_start[5] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      p_stop[5] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      p_period[5] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       p_flag[5] = 1;
       deviatoric_flag = 1;
       scalexy = 0;
       iarg += 4;
 
-    } else if (strcmp(arg[iarg],"couple") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} couple", style), error);
-      if (strcmp(arg[iarg+1],"xyz") == 0) pcouple = XYZ;
-      else if (strcmp(arg[iarg+1],"xy") == 0) pcouple = XY;
-      else if (strcmp(arg[iarg+1],"yz") == 0) pcouple = YZ;
-      else if (strcmp(arg[iarg+1],"xz") == 0) pcouple = XZ;
-      else if (strcmp(arg[iarg+1],"none") == 0) pcouple = NONE;
-      else error->all(FLERR,"Illegal fix {} couple option: {}", style, arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "couple") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} couple", style), error);
+      if (strcmp(arg[iarg + 1], "xyz") == 0)
+        pcouple = XYZ;
+      else if (strcmp(arg[iarg + 1], "xy") == 0)
+        pcouple = XY;
+      else if (strcmp(arg[iarg + 1], "yz") == 0)
+        pcouple = YZ;
+      else if (strcmp(arg[iarg + 1], "xz") == 0)
+        pcouple = XZ;
+      else if (strcmp(arg[iarg + 1], "none") == 0)
+        pcouple = NONE;
+      else
+        error->all(FLERR, iarg + 1, "Illegal fix {} couple option: {}", style, arg[iarg + 1]);
       iarg += 2;
 
-    } else if (strcmp(arg[iarg],"drag") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} drag", style), error);
-      drag = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      if (drag < 0.0) error->all(FLERR, "Invalid fix {} drag argument: {}", style, drag);
+    } else if (strcmp(arg[iarg], "drag") == 0) {
+      if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} drag", style), error);
+      drag = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      if (drag < 0.0) error->all(FLERR, iarg + 1, "Invalid fix {} drag argument: {}", style, drag);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"ptemp") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} ptemp", style), error);
-      p_temp = utils::numeric(FLERR,arg[iarg+1],false,lmp);
+    } else if (strcmp(arg[iarg], "ptemp") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} ptemp", style), error);
+      p_temp = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
       p_temp_flag = 1;
-      if (p_temp <= 0.0) error->all(FLERR, "Invalid fix {} ptemp argument: {}", style, p_temp);
+      if (p_temp <= 0.0)
+        error->all(FLERR, iarg + 1, "Invalid fix {} ptemp argument: {}", style, p_temp);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"dilate") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} dilate", style), error);
-      if (strcmp(arg[iarg+1],"all") == 0) allremap = 1;
+    } else if (strcmp(arg[iarg], "dilate") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} dilate", style), error);
+      if (strcmp(arg[iarg + 1], "all") == 0)
+        allremap = 1;
       else {
         allremap = 0;
         delete[] id_dilate;
-        id_dilate = utils::strdup(arg[iarg+1]);
+        id_dilate = utils::strdup(arg[iarg + 1]);
         int idilate = group->find(id_dilate);
         if (idilate < 0)
-          error->all(FLERR,"Fix {} dilate group ID {} does not exist", style, id_dilate);
+          error->all(FLERR, iarg + 1, "Fix {} dilate group ID {} does not exist", style, id_dilate);
       }
       iarg += 2;
 
-    } else if (strcmp(arg[iarg],"tchain") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} tchain", style), error);
-      mtchain = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+    } else if (strcmp(arg[iarg], "tchain") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} tchain", style), error);
+      mtchain = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
       // used by FixNVTSllod to preserve non-default value
       mtchain_default_flag = 0;
-      if (mtchain < 1) error->all(FLERR, "Invalid fix {} tchain argument: {}", style, mtchain);
+      if (mtchain < 1)
+        error->all(FLERR, iarg + 1, "Invalid fix {} tchain argument: {}", style, mtchain);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"pchain") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} pchain", style), error);
-      mpchain = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
-      if (mpchain < 0) error->all(FLERR, "Invalid fix {} pchain argument: {}", style, mpchain);
+    } else if (strcmp(arg[iarg], "pchain") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} pchain", style), error);
+      mpchain = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      if (mpchain < 0)
+        error->all(FLERR, iarg + 1, "Invalid fix {} pchain argument: {}", style, mpchain);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"mtk") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} mtk", style), error);
-      mtk_flag = utils::logical(FLERR,arg[iarg+1],false,lmp);
+    } else if (strcmp(arg[iarg], "mtk") == 0) {
+      if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} mtk", style), error);
+      mtk_flag = utils::logical(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"tloop") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} tloop", style), error);
-      nc_tchain = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
-      if (nc_tchain < 0) error->all(FLERR, "Invalid fix {} tloop argument: {}", style, nc_tchain);
+    } else if (strcmp(arg[iarg], "tloop") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} tloop", style), error);
+      nc_tchain = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      if (nc_tchain < 0)
+        error->all(FLERR, iarg + 1, "Invalid fix {} tloop argument: {}", style, nc_tchain);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"ploop") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} ploop", style), error);
-      nc_pchain = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
-      if (nc_pchain < 0) error->all(FLERR, "Invalid fix {} ploop argument: {}", style, nc_pchain);
+    } else if (strcmp(arg[iarg], "ploop") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} ploop", style), error);
+      nc_pchain = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      if (nc_pchain < 0)
+        error->all(FLERR, iarg + 1, "Invalid fix {} ploop argument: {}", style, nc_pchain);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"nreset") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} nreset", style), error);
-      nreset_h0 = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
-      if (nreset_h0 < 0) error->all(FLERR, "Invalid fix {} nreset argument: {}", style, nreset_h0);
+    } else if (strcmp(arg[iarg], "nreset") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} nreset", style), error);
+      nreset_h0 = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      if (nreset_h0 < 0)
+        error->all(FLERR, iarg + 1, "Invalid fix {} nreset argument: {}", style, nreset_h0);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"scalexy") == 0) {
-      if (iarg+2 > narg)
+    } else if (strcmp(arg[iarg], "scalexy") == 0) {
+      if (iarg + 2 > narg)
         utils::missing_cmd_args(FLERR, fmt::format("fix {} scalexy", style), error);
-      scalexy = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      scalexy = utils::logical(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"scalexz") == 0) {
-      if (iarg+2 > narg)
+    } else if (strcmp(arg[iarg], "scalexz") == 0) {
+      if (iarg + 2 > narg)
         utils::missing_cmd_args(FLERR, fmt::format("fix {} scalexz", style), error);
-      scalexz = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      scalexz = utils::logical(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"scaleyz") == 0) {
-      if (iarg+2 > narg)
+    } else if (strcmp(arg[iarg], "scaleyz") == 0) {
+      if (iarg + 2 > narg)
         utils::missing_cmd_args(FLERR, fmt::format("fix {} scaleyz", style), error);
-      scaleyz = utils::logical(FLERR,arg[iarg+1],false,lmp);
+      scaleyz = utils::logical(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"flip") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} flip", style), error);
-      flipflag = utils::logical(FLERR,arg[iarg+1],false,lmp);
+    } else if (strcmp(arg[iarg], "flip") == 0) {
+      if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} flip", style), error);
+      flipflag = utils::logical(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"update") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} update", style), error);
-      if (strcmp(arg[iarg+1],"dipole") == 0) dipole_flag = 1;
-      else if (strcmp(arg[iarg+1],"dipole/dlm") == 0) {
+    } else if (strcmp(arg[iarg], "update") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} update", style), error);
+      if (strcmp(arg[iarg + 1], "dipole") == 0)
+        dipole_flag = 1;
+      else if (strcmp(arg[iarg + 1], "dipole/dlm") == 0) {
         dipole_flag = 1;
         dlm_flag = 1;
-      } else error->all(FLERR, "Invalid fix {} update argument: {}", style, arg[iarg+1]);
+      } else
+        error->all(FLERR, iarg + 1, "Invalid fix {} update argument: {}", style, arg[iarg + 1]);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"isochoric") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, fmt::format("fix {} isochoric", style), error);
-      if (strcmp(arg[iarg+1], "x") == 0) p_isoch[0] = 1;
-      else if (strcmp(arg[iarg+1], "y") == 0) p_isoch[1] = 1;
-      else if (strcmp(arg[iarg+1], "z") == 0) p_isoch[2] = 1;
-      else if (strcmp(arg[iarg+1], "xy") == 0) p_isoch[0] = p_isoch[1] = 1;
-      else if (strcmp(arg[iarg+1], "yz") == 0) p_isoch[1] = p_isoch[2] = 1;
-      else if (strcmp(arg[iarg+1], "xz") == 0) p_isoch[0] = p_isoch[2] = 1;
-      else error->all(FLERR,"Illegal fix {} isochoric option: {}", style, arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "isochoric") == 0) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, fmt::format("fix {} isochoric", style), error);
+      if (strcmp(arg[iarg + 1], "x") == 0)
+        p_isoch[0] = 1;
+      else if (strcmp(arg[iarg + 1], "y") == 0)
+        p_isoch[1] = 1;
+      else if (strcmp(arg[iarg + 1], "z") == 0)
+        p_isoch[2] = 1;
+      else if (strcmp(arg[iarg + 1], "xy") == 0)
+        p_isoch[0] = p_isoch[1] = 1;
+      else if (strcmp(arg[iarg + 1], "yz") == 0)
+        p_isoch[1] = p_isoch[2] = 1;
+      else if (strcmp(arg[iarg + 1], "xz") == 0)
+        p_isoch[0] = p_isoch[2] = 1;
+      else
+        error->all(FLERR, iarg + 1, "Illegal fix {} isochoric option: {}", style, arg[iarg + 1]);
       isochoric = 1;
       iarg += 2;
-    } else if (strcmp(arg[iarg],"fixedpoint") == 0) {
-      if (iarg+4 > narg)
+    } else if (strcmp(arg[iarg], "fixedpoint") == 0) {
+      if (iarg + 4 > narg)
         utils::missing_cmd_args(FLERR, fmt::format("fix {} fixedpoint", style), error);
-      fixedpoint[0] = utils::numeric(FLERR,arg[iarg+1],false,lmp);
-      fixedpoint[1] = utils::numeric(FLERR,arg[iarg+2],false,lmp);
-      fixedpoint[2] = utils::numeric(FLERR,arg[iarg+3],false,lmp);
+      fixedpoint[0] = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      fixedpoint[1] = utils::numeric(FLERR, arg[iarg + 2], false, lmp);
+      fixedpoint[2] = utils::numeric(FLERR, arg[iarg + 3], false, lmp);
       iarg += 4;
 
-    // disc keyword is also parsed in fix/nh/sphere
+      // disc keyword is also parsed in fix/nh/sphere
 
-    } else if (strcmp(arg[iarg],"disc") == 0) {
+    } else if (strcmp(arg[iarg], "disc") == 0) {
       iarg++;
 
-    // keywords erate, strain, and ext are also parsed in fix/nh/uef
+      // keywords erate, strain, and ext are also parsed in fix/nh/uef
 
-    } else if (strcmp(arg[iarg],"erate") == 0) {
+    } else if (strcmp(arg[iarg], "erate") == 0) {
       iarg += 3;
-    } else if (strcmp(arg[iarg],"strain") == 0) {
+    } else if (strcmp(arg[iarg], "strain") == 0) {
       iarg += 3;
-    } else if (strcmp(arg[iarg],"ext") == 0) {
+    } else if (strcmp(arg[iarg], "ext") == 0) {
       iarg += 2;
 
-    // keywords psllod, peculiar, kick and integrator are parsed in fix/nvt/sllod
+      // keywords psllod, peculiar, kick and integrator are parsed in fix/nvt/sllod
 
-    } else if (strcmp(arg[iarg],"psllod") == 0) {
+    } else if (strcmp(arg[iarg], "psllod") == 0) {
       iarg += 2;
     } else if (strcmp(arg[iarg], "peculiar") == 0) {
       iarg += 2;
@@ -381,9 +418,11 @@ FixNH::FixNH(LAMMPS *lmp, int narg, char **arg) :
     } else if (strcmp(arg[iarg], "integrator") == 0) {
       iarg += 2;
 
-    } else error->all(FLERR,"Unknown fix {} keyword: {}", style, arg[iarg]);
+    } else
+      error->all(FLERR, iarg, "Unknown fix {} keyword: {}", style, arg[iarg]);
   }
 
+  // clang-format off
   // error checks
 
   if (dimension == 2 && (p_flag[2] || p_flag[3] || p_flag[4]))
@@ -475,12 +514,9 @@ FixNH::FixNH(LAMMPS *lmp, int narg, char **arg) :
   }
 
   if (isochoric) {
-    for (int i; i < 3; i++) {
-      if (p_flag[i]) {
-        if (p_isoch[i]) {
-          error->all(FLERR,"Cannot use barostated dimension as isochoric dimension.");
-        }
-      }
+    for (int i = 0; i < 3; i++) {
+      if (p_flag[i] && p_isoch[i])
+        error->all(FLERR,"Cannot use barostated dimension as isochoric dimension.");
     }
     if (dimension == 3 && (p_flag[0] + p_flag[1] + p_flag[2] > 2)) {
       error->all(FLERR,"Cannot perform isochoric NPT with all dimensions barostated.");
@@ -1127,10 +1163,6 @@ void FixNH::remap()
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
   double *h = domain->h;
-  double old_volume, new_volume;
-
-  if (dimension == 3) old_volume = domain->xprd * domain->yprd * domain->zprd;
-  else old_volume = domain->xprd * domain->yprd;
 
   // omega is not used, except for book-keeping
 
@@ -1439,9 +1471,9 @@ void FixNH::restart(char *buf)
 {
   int n = 0;
   auto *list = (double *) buf;
-  int flag = static_cast<int> (list[n++]);
+  int flag = static_cast<int>(list[n++]);
   if (flag) {
-    int m = static_cast<int> (list[n++]);
+    int m = static_cast<int>(list[n++]);
     if (tstat_flag && m == mtchain) {
       for (int ich = 0; ich < mtchain; ich++)
         eta[ich] = list[n++];
@@ -1449,7 +1481,7 @@ void FixNH::restart(char *buf)
         eta_dot[ich] = list[n++];
     } else n += 2*m;
   }
-  flag = static_cast<int> (list[n++]);
+  flag = static_cast<int>(list[n++]);
   if (flag) {
     omega[0] = list[n++];
     omega[1] = list[n++];
@@ -1465,14 +1497,14 @@ void FixNH::restart(char *buf)
     omega_dot[5] = list[n++];
     vol0 = list[n++];
     t0 = list[n++];
-    int m = static_cast<int> (list[n++]);
+    int m = static_cast<int>(list[n++]);
     if (pstat_flag && m == mpchain) {
       for (int ich = 0; ich < mpchain; ich++)
         etap[ich] = list[n++];
       for (int ich = 0; ich < mpchain; ich++)
         etap_dot[ich] = list[n++];
     } else n+=2*m;
-    flag = static_cast<int> (list[n++]);
+    flag = static_cast<int>(list[n++]);
     if (flag) {
       h0_inv[0] = list[n++];
       h0_inv[1] = list[n++];
@@ -1481,11 +1513,11 @@ void FixNH::restart(char *buf)
       h0_inv[4] = list[n++];
       h0_inv[5] = list[n++];
     }
-    flag = static_cast<int> (list[n++]);
+    flag = static_cast<int>(list[n++]);
     if (flag) {
-      p_isoch[0] = list[n++];
-      p_isoch[1] = list[n++];
-      p_isoch[2] = list[n++];
+      p_isoch[0] = static_cast<int>(list[n++]);
+      p_isoch[1] = static_cast<int>(list[n++]);
+      p_isoch[2] = static_cast<int>(list[n++]);
       vol_start = list[n++];
     }
   }
diff --git a/src/force.h b/src/force.h
index 09d2dca98f5..5a3da0d8708 100644
--- a/src/force.h
+++ b/src/force.h
@@ -26,9 +26,14 @@ class Improper;
 class KSpace;
 class Pair;
 
-enum { ENERGY_NONE = 0x00, ENERGY_GLOBAL = 0x01, ENERGY_ATOM = 0x02 };
-
 // clang-format off
+enum {
+  ENERGY_NONE   = 0x00,
+  ENERGY_GLOBAL = 0x01,
+  ENERGY_ATOM   = 0x02,
+  ENERGY_ONLY   = 0x04
+};
+
 enum {
   VIRIAL_NONE     = 0x00,
   VIRIAL_PAIR     = 0x01,
@@ -36,9 +41,13 @@ enum {
   VIRIAL_ATOM     = 0x04,
   VIRIAL_CENTROID = 0x08
 };
-// clang-format on
 
-enum { CENTROID_SAME = 0, CENTROID_AVAIL = 1, CENTROID_NOTAVAIL = 2 };
+enum {
+  CENTROID_SAME     = 0x00,
+  CENTROID_AVAIL    = 0x01,
+  CENTROID_NOTAVAIL = 0x02
+};
+// clang-format on
 
 class Force : protected Pointers {
  public:
diff --git a/src/improper.cpp b/src/improper.cpp
index ab1d91df85a..d3790dddf28 100644
--- a/src/improper.cpp
+++ b/src/improper.cpp
@@ -96,14 +96,15 @@ void Improper::settings(int narg, char **args)
    see integrate::ev_set() for bitwise settings of eflag/vflag
    set the following flags, values are otherwise set to 0:
      evflag       != 0 if any bits of eflag or vflag are set
-     eflag_global != 0 if ENERGY_GLOBAL bit of eflag set
-     eflag_atom   != 0 if ENERGY_ATOM bit of eflag set
+     eflag_global != 0 if ENERGY_GLOBAL bit of eflag is set
+     eflag_atom   != 0 if ENERGY_ATOM bit of eflag is set
      eflag_either != 0 if eflag_global or eflag_atom is set
-     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag set
-     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag set
-     vflag_atom   != 0 if VIRIAL_CENTROID bit of vflag set
+     eflag_only   != 0 if ENERGY_GLOBAL and ENERGY_ONLY bits of eflag are set
+     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag is set
+     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag is set
+     vflag_atom   != 0 if VIRIAL_CENTROID bit of vflag is set
                        and centroidstressflag != CENTROID_AVAIL
-     cvflag_atom  != 0 if VIRIAL_CENTROID bit of vflag set
+     cvflag_atom  != 0 if VIRIAL_CENTROID bit of vflag is set
                        and centroidstressflag = CENTROID_AVAIL
      vflag_either != 0 if any of vflag_global, vflag_atom, cvflag_atom is set
 ------------------------------------------------------------------------- */
@@ -114,9 +115,10 @@ void Improper::ev_setup(int eflag, int vflag, int alloc)
 
   evflag = 1;
 
-  eflag_either = eflag;
+  eflag_either = eflag & (ENERGY_GLOBAL | ENERGY_ATOM);
   eflag_global = eflag & ENERGY_GLOBAL;
   eflag_atom = eflag & ENERGY_ATOM;
+  eflag_only = eflag_global ? (eflag & ENERGY_ONLY) : 0;
 
   vflag_global = vflag & (VIRIAL_PAIR | VIRIAL_FDOTR);
   vflag_atom = vflag & VIRIAL_ATOM;
diff --git a/src/improper.h b/src/improper.h
index da0ae1276f8..1d9bef1d0ce 100644
--- a/src/improper.h
+++ b/src/improper.h
@@ -77,7 +77,7 @@ class Improper : protected Pointers {
   int suffix_flag;    // suffix compatibility flag
 
   int evflag;
-  int eflag_either, eflag_global, eflag_atom;
+  int eflag_either, eflag_global, eflag_atom, eflag_only;
   int vflag_either, vflag_global, vflag_atom, cvflag_atom;
   int maxeatom, maxvatom, maxcvatom;
 
@@ -86,8 +86,8 @@ class Improper : protected Pointers {
     if (eflag || vflag)
       ev_setup(eflag, vflag, alloc);
     else
-      evflag = eflag_either = eflag_global = eflag_atom = vflag_either = vflag_global = vflag_atom =
-          cvflag_atom = 0;
+      evflag = eflag_either = eflag_global = eflag_atom = eflag_only = vflag_either = vflag_global =
+          vflag_atom = cvflag_atom = 0;
   }
   void ev_setup(int, int, int alloc = 1);
   void ev_tally(int, int, int, int, int, int, double, double *, double *, double *, double, double,
diff --git a/src/kspace.cpp b/src/kspace.cpp
index 65a6d4d9d92..6d344c1f4e0 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -223,11 +223,12 @@ void KSpace::pair_check()
    see integrate::ev_set() for bitwise settings of eflag/vflag
    set the following flags, values are otherwise set to 0:
      evflag       != 0 if any bits of eflag or vflag are set
-     eflag_global != 0 if ENERGY_GLOBAL bit of eflag set
-     eflag_atom   != 0 if ENERGY_ATOM bit of eflag set
+     eflag_global != 0 if ENERGY_GLOBAL bit of eflag is set
+     eflag_atom   != 0 if ENERGY_ATOM bit of eflag is set
      eflag_either != 0 if eflag_global or eflag_atom is set
-     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag set
-     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag set
+     eflag_only   != 0 if ENERGY_GLOBAL and ENERGY_ONLY bits of eflag are set
+     vflag_global != 0 if VIRIAL_PAIR or VIRIAL_FDOTR bit of vflag is set
+     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag is set
                        no current support for centroid stress
      vflag_either != 0 if vflag_global or vflag_atom is set
      evflag_atom  != 0 if eflag_atom or vflag_atom is set
@@ -239,9 +240,10 @@ void KSpace::ev_setup(int eflag, int vflag, int alloc)
 
   evflag = 1;
 
-  eflag_either = eflag;
+  eflag_either = eflag & (ENERGY_GLOBAL | ENERGY_ATOM);
   eflag_global = eflag & ENERGY_GLOBAL;
   eflag_atom = eflag & ENERGY_ATOM;
+  eflag_only = eflag_global ? (eflag & ENERGY_ONLY) : 0;
 
   vflag_either = vflag;
   vflag_global = vflag & (VIRIAL_PAIR | VIRIAL_FDOTR);
diff --git a/src/kspace.h b/src/kspace.h
index 214d0a63cc1..01ef775a0da 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -128,11 +128,11 @@ class KSpace : protected Pointers {
   uint64_t datamask_read, datamask_modify;
   int copymode;
 
-  int compute_flag;       // 0 if skip compute()
-  int fftbench;           // 0 if skip FFT timing
-  int collective_flag;    // 1 if use MPI collectives for FFT/remap
-  int nonblocking_flag;   // 1 if use MPI_Isend for FFT/remap
-  int stagger_flag;       // 1 if using staggered PPPM grids
+  int compute_flag;        // 0 if skip compute()
+  int fftbench;            // 0 if skip FFT timing
+  int collective_flag;     // 1 if use MPI collectives for FFT/remap
+  int nonblocking_flag;    // 1 if use MPI_Isend for FFT/remap
+  int stagger_flag;        // 1 if using staggered PPPM grids
 
   double splittol;    // tolerance for when to truncate splitting
 
@@ -232,7 +232,7 @@ class KSpace : protected Pointers {
   double **gcons, **dgcons;    // accumulated per-atom energy/virial
 
   int evflag, evflag_atom;
-  int eflag_either, eflag_global, eflag_atom;
+  int eflag_either, eflag_global, eflag_atom, eflag_only;
   int vflag_either, vflag_global, vflag_atom;
   int maxeatom, maxvatom;
 
@@ -245,7 +245,7 @@ class KSpace : protected Pointers {
     if (eflag || vflag)
       ev_setup(eflag, vflag, alloc);
     else
-      evflag = evflag_atom = eflag_either = eflag_global = eflag_atom = vflag_either =
+      evflag = evflag_atom = eflag_either = eflag_global = eflag_atom = eflag_only = vflag_either =
           vflag_global = vflag_atom = 0;
   }
   void ev_setup(int, int, int alloc = 1);
diff --git a/src/math_extra.h b/src/math_extra.h
index 8b5373b8a57..bbbdcd25e69 100644
--- a/src/math_extra.h
+++ b/src/math_extra.h
@@ -24,6 +24,7 @@ namespace MathExtra {
 
 // 3 vector operations
 
+inline void copy2(const double *v, double *ans);
 inline void copy3(const double *v, double *ans);
 inline void zero3(double *v);
 inline void norm3(double *v);
@@ -140,6 +141,16 @@ double beta(double x, double y);
    copy a vector, return in ans
 ------------------------------------------------------------------------- */
 
+inline void MathExtra::copy2(const double *v, double *ans)
+{
+  ans[0] = v[0];
+  ans[1] = v[1];
+}
+
+/* ----------------------------------------------------------------------
+   copy a 3 vector, return in ans
+------------------------------------------------------------------------- */
+
 inline void MathExtra::copy3(const double *v, double *ans)
 {
   ans[0] = v[0];
diff --git a/src/pair.cpp b/src/pair.cpp
index ca8eeb0db3f..ac5d8b90558 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -890,16 +890,17 @@ void Pair::map_element2type(int narg, char **arg, bool update_setflag)
    setup for energy, virial computation
    see integrate::ev_set() for bitwise settings of eflag/vflag
    set the following flags, values are otherwise set to 0:
-     eflag_global != 0 if ENERGY_GLOBAL bit of eflag set
-     eflag_atom   != 0 if ENERGY_ATOM bit of eflag set
+     eflag_global != 0 if ENERGY_GLOBAL bit of eflag is set
+     eflag_atom   != 0 if ENERGY_ATOM bit of eflag is set
      eflag_either != 0 if eflag_global or eflag_atom is set
-     vflag_global != 0 if VIRIAL_PAIR bit of vflag set, OR
+     eflag_only   != 0 if ENERGY_GLOBAL and ENERGY_ONLY bits of eflag are set
+     vflag_global != 0 if VIRIAL_PAIR bit of vflag is set, OR
                        if VIRIAL_FDOTR bit of vflag is set but no_virial_fdotr = 1
-     vflag_fdotr  != 0 if VIRIAL_FDOTR bit of vflag set and no_virial_fdotr = 0
-     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag set, OR
-                       if VIRIAL_CENTROID bit of vflag set
+     vflag_fdotr  != 0 if VIRIAL_FDOTR bit of vflag is set and no_virial_fdotr = 0
+     vflag_atom   != 0 if VIRIAL_ATOM bit of vflag is set, OR
+                       if VIRIAL_CENTROID bit of vflag is set
                        and centroidstressflag != CENTROID_AVAIL
-     cvflag_atom  != 0 if VIRIAL_CENTROID bit of vflag set
+     cvflag_atom  != 0 if VIRIAL_CENTROID bit of vflag is set
                        and centroidstressflag = CENTROID_AVAIL
      vflag_either != 0 if any of vflag_global, vflag_atom, cvflag_atom is set
      evflag       != 0 if eflag_either or vflag_either is set
@@ -913,9 +914,10 @@ void Pair::ev_setup(int eflag, int vflag, int alloc)
 {
   int i,n;
 
-  eflag_either = eflag;
+  eflag_either = eflag & (ENERGY_GLOBAL | ENERGY_ATOM);
   eflag_global = eflag & ENERGY_GLOBAL;
   eflag_atom = eflag & ENERGY_ATOM;
+  eflag_only = eflag_global ? (eflag & ENERGY_ONLY) : 0;
 
   vflag_global = vflag & VIRIAL_PAIR;
   if (vflag & VIRIAL_FDOTR && no_virial_fdotr_compute == 1) vflag_global = 1;
@@ -1015,6 +1017,7 @@ void Pair::ev_unset()
   eflag_either = 0;
   eflag_global = 0;
   eflag_atom = 0;
+  eflag_only = 0;
 
   vflag_either = 0;
   vflag_global = 0;
diff --git a/src/pair.h b/src/pair.h
index 9b004f27bed..618ba951cdd 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -88,7 +88,7 @@ class Pair : protected Pointers {
   int trim_flag;    // pair_modify flag for trimming neigh list
 
   int evflag;    // energy,virial settings
-  int eflag_either, eflag_global, eflag_atom;
+  int eflag_either, eflag_global, eflag_atom, eflag_only;
   int vflag_either, vflag_global, vflag_atom, cvflag_atom;
 
   int ncoultablebits;    // size of Coulomb table, accessed by KSpace
diff --git a/src/set.cpp b/src/set.cpp
index 6f786ddbd5f..9de1f406b72 100644
--- a/src/set.cpp
+++ b/src/set.cpp
@@ -1125,7 +1125,8 @@ void Set::invoke_apip_lambda(Action *action)
 void Set::process_block(int &iarg, int narg, char **arg, Action *action)
 {
   if (!atom->superellipsoid_flag)
-    error->all(FLERR,"Cannot set attribute {} for atom style {} (available with ellipsoid with superellipsoid flag)", arg[iarg], atom->get_style());
+    error->all(FLERR,"Cannot set attribute {} for atom style {} (only available for ellipsoid "
+               "with superellipsoid flag)", arg[iarg], atom->get_style());
   if (iarg+3 > narg) utils::missing_cmd_args(FLERR, "set block", error);
   if (utils::strmatch(arg[iarg+1],"^v_")) varparse(arg[iarg+1],1,action);
   else {
@@ -1140,11 +1141,11 @@ void Set::process_block(int &iarg, int narg, char **arg, Action *action)
   iarg += 3;
 }
 
-
 void Set::invoke_block(Action *action)
 {
   int nlocal = atom->nlocal;
   auto *avec_ellipsoid = dynamic_cast<AtomVecEllipsoid *>(atom->style_match("ellipsoid"));
+  if (!avec_ellipsoid) return;
 
   int varflag = action->varflag;
   double block1 = 0.0, block2 = 0.0;
diff --git a/unittest/force-styles/tests/atomic-pair-lepton_sphere.yaml b/unittest/force-styles/tests/atomic-pair-lepton_sphere.yaml
index 0e00e418a19..230a5595b1d 100644
--- a/unittest/force-styles/tests/atomic-pair-lepton_sphere.yaml
+++ b/unittest/force-styles/tests/atomic-pair-lepton_sphere.yaml
@@ -3,7 +3,7 @@ lammps_version: 28 Mar 2023
 date_generated: Fri Apr  7 18:04:29 2023
 tags: unstable
 epsilon: 7.5e-13
-skip_tests: single
+skip_tests:
 prerequisites: ! |
   pair lepton/sphere
   atom sphere
diff --git a/unittest/force-styles/tests/atomic-pair-lj_cut_sphere.yaml b/unittest/force-styles/tests/atomic-pair-lj_cut_sphere.yaml
index 193a65122e7..90102088e9c 100644
--- a/unittest/force-styles/tests/atomic-pair-lj_cut_sphere.yaml
+++ b/unittest/force-styles/tests/atomic-pair-lj_cut_sphere.yaml
@@ -3,7 +3,7 @@ lammps_version: 28 Mar 2023
 date_generated: Thu Mar 30 14:38:22 2023
 tags: unstable
 epsilon: 7.5e-13
-skip_tests: single
+skip_tests:
 prerequisites: ! |
   pair lj/cut/sphere
   atom sphere
diff --git a/unittest/force-styles/tests/fix-timestep-efield_variable.yaml b/unittest/force-styles/tests/fix-timestep-efield_variable.yaml
index 1ec1e4098ae..b08f70b55e6 100644
--- a/unittest/force-styles/tests/fix-timestep-efield_variable.yaml
+++ b/unittest/force-styles/tests/fix-timestep-efield_variable.yaml
@@ -1,7 +1,7 @@
 ---
 lammps_version: 27 Jun 2024
 date_generated: Sat Aug  3 05:18:19 2024
-epsilon: 2e-13
+epsilon: 5.0e-13
 skip_tests:
 prerequisites: ! |
   atom full