#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
file : electrostatic_pme_constraint.py
created time : 2022/04/07
author : Zhenyu Wei
copyright : (C)Copyright 2021-present, mdpy organization
"""
import math
import numpy as np
import numba as nb
import cupy as cp
from numba import cuda
from mdpy import SPATIAL_DIM
from mdpy.environment import *
from mdpy.core import Ensemble
from mdpy.core import NUM_PARTICLES_PER_TILE
from mdpy.constraint import Constraint
from mdpy.utils import *
from mdpy.unit import *
from mdpy.error import *
PME_ORDER = 4
THREAD_PER_BLOCK = 64
TILES_PER_THREAD = 4
def bspline(x, order):
    """Evaluate the cardinal B-spline M_order(x) through the standard recursion,
    starting from the order-2 (linear hat) base case."""
    if order == 2:
        return 1 - np.abs(x - 1) if 0 <= x <= 2 else 0
    else:
        return x * bspline(x, order - 1) / (order - 1) + (order - x) * bspline(
            x - 1, order - 1
        ) / (order - 1)
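# A quick, illustrative sanity check of the recursion above (values worked out
# by hand): the order-4 B-spline weights at a fixed fractional offset form a
# partition of unity, which is what keeps the charge spreading conservative:
#     >>> [round(bspline(x, 4), 4) for x in (0.5, 1.5, 2.5, 3.5)]
#     [0.0208, 0.4792, 0.4792, 0.0208]
#     >>> sum(bspline(x, 4) for x in (0.5, 1.5, 2.5, 3.5))  # 1.0 up to rounding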
def gamma_sum(m, num_grids, order):
    """Approximate sum_{k} (x / (x + pi * k))**order at x = pi * m / num_grids,
    truncated at |k| = 50 (the k = 0 term contributes 1)."""
    x = np.pi * m / num_grids
    k = np.arange(1, 51)
    res = 1
    res += ((x / (x + np.pi * k)) ** order).sum()
    res += ((x / (x - np.pi * k)) ** order).sum()
    return res
class ElectrostaticPMEConstraint(Constraint):
    def __init__(
self, cutoff_radius=Quantity(8, angstrom), direct_sum_energy_tolerance=1e-5
) -> None:
super().__init__()
self._cutoff_radius = check_quantity_value(cutoff_radius, default_length_unit)
self._device_cutoff_radius = cp.array([self._cutoff_radius], CUPY_FLOAT)
self._direct_sum_energy_tolerance = direct_sum_energy_tolerance
self._ewald_coefficient = self._get_ewald_coefficient()
self._device_ewald_coefficient = cp.array([self._ewald_coefficient], CUPY_FLOAT)
self._k = 4 * np.pi * EPSILON0.value
self._device_k = cp.array([self._k], CUPY_FLOAT)
self._device_inverse_k = cp.array([1 / self._k], CUPY_FLOAT)
        # Attributes
        self._grid_size = None
        self._b_grid = None
        self._c_grid = None
        # Kernels
self._update_pme_direct_part = cuda.jit(
nb.void(
NUMBA_FLOAT[::1], # inverse_k
NUMBA_FLOAT[::1], # ewald_coefficient
NUMBA_FLOAT[::1], # cutoff_radius
NUMBA_FLOAT[:, ::1], # pbc_matrix
NUMBA_FLOAT[:, ::1], # sorted_positions
NUMBA_FLOAT[:, ::1], # sorted_charges
NUMBA_BIT[:, ::1], # exclusion_map
NUMBA_INT[:, ::1], # tile_neighbors
NUMBA_FLOAT[:, ::1], # sorted_forces
NUMBA_FLOAT[::1], # potential_energy
),
fastmath=True,
max_registers=32,
)(self._update_pme_direct_part_kernel)
self._update_excluded_pme_direct_part = cuda.jit(
nb.void(
NUMBA_FLOAT[::1], # inverse_k
NUMBA_FLOAT[::1], # ewald_coefficient
NUMBA_FLOAT[::1], # cutoff_radius
NUMBA_FLOAT[:, ::1], # pbc_matrix
NUMBA_FLOAT[:, ::1], # positions
NUMBA_FLOAT[:, ::1], # charges
NUMBA_INT[:, ::1], # excluded_particles
NUMBA_FLOAT[:, ::1], # forces
NUMBA_FLOAT[::1], # potential_energy
)
)(self._update_excluded_pme_direct_part_kernel)
self._update_bspline = cuda.jit(
nb.void(
NUMBA_FLOAT[:, ::1], # position
NUMBA_INT[::1], # grid_size,
NUMBA_FLOAT[:, ::1], # pbc_matrix
NUMBA_FLOAT[:, :, ::1], # spline_coefficient
NUMBA_FLOAT[:, :, ::1], # spline_derivative_coefficient
NUMBA_INT[:, :, ::1], # grid_map
)
)(self._update_bspline_kernel)
self._update_charge_map = cuda.jit(
nb.void(
NUMBA_INT[::1], # num_particles
NUMBA_FLOAT[:, :, ::1], # spline_coefficient
NUMBA_INT[:, :, ::1], # grid_map
NUMBA_FLOAT[:, ::1], # charges
NUMBA_FLOAT[:, :, ::1], # charge_map
)
)(self._update_charge_map_kernel)
        self._update_electric_potential_map = (
            self._update_reciprocal_electric_potential_map_kernel
        )  # Host-side cupy FFT routine; no CUDA JIT compilation needed
self._update_reciprocal_force = cuda.jit(
nb.void(
NUMBA_FLOAT[:, :, ::1], # spline_coefficient
NUMBA_FLOAT[:, :, ::1], # spline_derivative_coefficient
NUMBA_INT[:, :, ::1], # grid_map
NUMBA_FLOAT[:, :, ::1], # electric_potential_map
NUMBA_FLOAT[:, ::1], # charges
NUMBA_FLOAT[:, ::1], # forces
NUMBA_FLOAT[::1], # potential_energy
)
)(self._update_reciprocal_force_kernel)
def __repr__(self) -> str:
return "<mdpy.constraint.ElectrostaticPMEConstraint object>"
def __str__(self) -> str:
return "PME electrostatic constraint"
def _get_ewald_coefficient(self):
"""
Using Newton interation solve ewald coefficient
Essentially, the Ewald coefficient is mathematical shorthand for 1/2s,
where s is the width of the Gaussian used to smooth out charges on the grid.
That width is chosen such that, at the direct space interaction `cutoff_radius`,
the interaction of two Gaussian-smoothed charges and the interaction of
two point charges are identical to a precision of `direct_sum_energy_tolerance`,
which mean the interaction between these 4 charge site are small enough,
proving truncation yielding no big error
f(alpha) = erfc(alpha*cutoff_radius) / cutoff_radius - direct_sum_energy_tolerance
f'(alpha) = - 2/sqrt(pi) * exp[-(alpha*cutoff_radius)^2]
alpha_new = alpha_old - f(alpha_old) / f'(alpha_old)
"""
alpha = 0.1
sqrt_pi = np.sqrt(np.pi)
while True:
f = (
math.erfc(alpha * self._cutoff_radius) / self._cutoff_radius
- self._direct_sum_energy_tolerance
)
df = -2 * np.exp(-((alpha * self._cutoff_radius) ** 2)) / sqrt_pi
d_alpha = f / df
if np.abs(d_alpha / alpha) < 1e-5:
break
alpha -= d_alpha
return alpha
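    # Illustrative check of the Newton solve above: at convergence alpha
    # satisfies erfc(alpha * r_c) / r_c ~ tolerance. For the default cutoff of
    # 8 angstrom and tolerance 1e-5 this gives alpha of roughly 0.35 1/angstrom
    # (approximate figure, assuming the default constructor arguments):
    #     >>> constraint = ElectrostaticPMEConstraint()
    #     >>> math.erfc(constraint.ewald_coefficient * 8) / 8  # ~ 1e-5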
def _get_grid_size(self):
"""
Set the grid size with spacing around 0.5 angstrom and power to 2 for better fft performance
"""
pbc_diag = np.diag(
Quantity(self._parent_ensemble.state.pbc_matrix, default_length_unit)
.convert_to(angstrom)
.value
)
grid_size = np.ceil(pbc_diag / 32) * 32
return grid_size.astype(NUMPY_INT)
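    # Worked example of the rounding rule above (illustrative): a box of
    # 31 x 64 x 65 angstrom maps to a (32, 64, 96) grid, since each length is
    # rounded up to the next multiple of 32:
    #     >>> np.ceil(np.array([31.0, 64.0, 65.0]) / 32) * 32
    #     array([32., 64., 96.])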
def _get_b_grid(self):
b_grid = np.zeros(self._grid_size)
b_factor = []
for i in range(SPATIAL_DIM):
num_grids = self._grid_size[i]
half_num_grids = num_grids // 2
            # Denominator
            # Compute the squared modulus of the denominator directly
denominator = np.zeros([num_grids])
prefactor = np.zeros([3])
prefactor[0] = bspline(1, PME_ORDER)
prefactor[1] = bspline(2, PME_ORDER)
prefactor[2] = bspline(3, PME_ORDER)
k_vec = np.arange(3)
for m in np.arange(num_grids):
factor = 2 * np.pi * m * k_vec / num_grids
sin_term = (prefactor * np.sin(factor)).sum()
cos_term = (prefactor * np.cos(factor)).sum()
denominator[m] = sin_term**2 + cos_term**2
            # Handle small denominators
            for j in range(num_grids):
                if denominator[j] <= 1e-7:
                    if j != 0 and j != num_grids - 1:
                        # Replace with the mean of the two neighboring points
                        denominator[j] = (denominator[j - 1] + denominator[j + 1]) / 2
                    else:
                        # Boundary points: use a very large denominator so the
                        # resulting b factor is effectively 0
                        denominator[j] = 1e7
            # Numerator
            numerator = np.zeros([num_grids])
            for index in range(num_grids):
                if index >= half_num_grids:
                    m = index - num_grids
                else:
                    m = index
                if m == 0:
                    numerator[index] = 1
                else:
                    numerator[index] = gamma_sum(m, num_grids, PME_ORDER) / gamma_sum(
                        m, num_grids, 8
                    )
            b_factor.append(numerator**2 / denominator)
for i in range(self._grid_size[0]):
for j in range(self._grid_size[1]):
for k in range(self._grid_size[2]):
b_grid[i, j, k] = b_factor[0][i] * b_factor[1][j] * b_factor[2][k]
return b_grid
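    # For reference, the grid built above matches the smooth-PME structure
    # factor of Essmann et al., J. Chem. Phys. 103, 8577 (1995):
    #     B(m1, m2, m3) = |b1(m1)|^2 * |b2(m2)|^2 * |b3(m3)|^2
    # where each |b(m)|^2 is numerator / denominator: the denominator is
    # |sum_{k=0}^{order-2} M_order(k + 1) * exp(2*pi*i*m*k / K)|^2 (the sin/cos
    # sums above) and the numerator is the even-order gamma-sum correction.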
def _get_c_grid(self):
c_grid = np.zeros(self._grid_size)
        frequency_factor, exp_factor = [], []
for i in range(SPATIAL_DIM):
num_grids = self._grid_size[i]
half_num_grids = num_grids // 2
m_vec = np.arange(num_grids)
m_vec[m_vec > half_num_grids] -= num_grids
m_vec = m_vec / self._parent_ensemble.state.pbc_matrix[i, i]
exp_vec = np.exp(-((np.pi * m_vec / self._ewald_coefficient) ** 2))
            frequency_factor.append(m_vec)
exp_factor.append(exp_vec)
for i in range(self._grid_size[0]):
for j in range(self._grid_size[1]):
for k in range(self._grid_size[2]):
if i == 0 and j == 0 and k == 0:
continue
c_grid[i, j, k] = (
exp_factor[0][i] * exp_factor[1][j] * exp_factor[2][k]
)
c_grid[i, j, k] = c_grid[i, j, k] / (
                        frequency_factor[0][i] ** 2
                        + frequency_factor[1][j] ** 2
                        + frequency_factor[2][k] ** 2
)
        c_grid = c_grid / (  # Divide by pi * V / num_grids_total
np.prod(np.diagonal(self._parent_ensemble.state.pbc_matrix))
/ self._num_grids_total
* np.pi
)
return c_grid
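    # The grid built above is the Ewald reciprocal-space influence function,
    # proportional to exp(-pi^2 * |m|^2 / alpha^2) / (pi * V * |m|^2) with m in
    # units of inverse box length; the num_grids_total factor in the final
    # division appears to offset the 1/N normalization of cp.fft.ifftn in the
    # convolution step (a reading of the code, not a documented fact).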
def _get_self_potential_energy(self):
potential_energy = (
(self._parent_ensemble.topology.charges**2).sum()
* self._ewald_coefficient
/ self._k
/ np.sqrt(np.pi)
)
return potential_energy
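    # Worked form of the correction above, the standard Ewald self-energy:
    #     E_self = alpha / (4 * pi * epsilon0 * sqrt(pi)) * sum_i q_i^2
    # (here self._k = 4 * pi * epsilon0). update() subtracts this term because
    # the reciprocal sum otherwise counts each smoothed charge interacting with
    # its own Gaussian cloud.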
def bind_ensemble(self, ensemble: Ensemble):
self._parent_ensemble = ensemble
self._constraint_id = ensemble.constraints.index(self)
if not np.isclose(self._parent_ensemble.topology.charges.sum(), 0, atol=1e-3):
raise EnsemblePoorDefinedError(
"mdpy.constraint.ElectrostaticPMEConstraint is bound to a non-neutralized ensemble"
)
# Grid size
self._grid_size = self._get_grid_size()
self._num_grids_total = np.prod(self._grid_size)
# create b grid
self._b_grid = self._get_b_grid()
# create c grid
self._c_grid = self._get_c_grid()
# b_grid * c_grid
self._bc_grid = self._b_grid * self._c_grid
# calculate self energy correction
self._self_potential_energy = self._get_self_potential_energy()
# device attributes
self._device_grid_size = cp.array(self._grid_size, CUPY_INT)
self._device_num_particles = cp.array(
[self._parent_ensemble.topology.num_particles], CUPY_INT
)
self._device_self_potential_energy = cp.array(
self._self_potential_energy, CUPY_FLOAT
)
self._device_b_grid = cp.array(self._b_grid, CUPY_FLOAT)
self._device_c_grid = cp.array(self._c_grid, CUPY_FLOAT)
self._device_bc_grid = cp.array(self._bc_grid, CUPY_FLOAT)
self._device_reciprocal_factor = cp.array(
self._grid_size / np.diagonal(self._parent_ensemble.state.pbc_matrix),
CUPY_FLOAT,
)
@staticmethod
def _update_pme_direct_part_kernel(
inverse_k,
ewald_coefficient,
cutoff_radius,
pbc_matrix,
sorted_positions,
sorted_charges,
exclusion_map,
tile_neighbors,
sorted_forces,
potential_energy,
):
# Particle index information
local_thread_x = cuda.threadIdx.x
local_thread_y = cuda.threadIdx.y
tile_id1 = cuda.blockIdx.x * TILES_PER_THREAD + local_thread_y
if tile_id1 >= tile_neighbors.shape[0]:
return
tile1_particle_index = tile_id1 * NUM_PARTICLES_PER_TILE + local_thread_x
        # Local and shared buffers
local_pbc_matrix = cuda.local.array((SPATIAL_DIM), NUMBA_FLOAT)
local_half_pbc_matrix = cuda.local.array((SPATIAL_DIM), NUMBA_FLOAT)
for i in range(SPATIAL_DIM):
local_pbc_matrix[i] = pbc_matrix[i, i]
local_half_pbc_matrix[i] = local_pbc_matrix[i] * NUMBA_FLOAT(0.5)
tile1_positions = cuda.local.array((SPATIAL_DIM), NUMBA_FLOAT)
tile2_positions = cuda.shared.array(
(SPATIAL_DIM, TILES_PER_THREAD, NUM_PARTICLES_PER_TILE),
NUMBA_FLOAT,
)
tile2_charges = cuda.shared.array(
(TILES_PER_THREAD, NUM_PARTICLES_PER_TILE), NUMBA_FLOAT
)
cuda.syncthreads()
# Read data
for i in range(SPATIAL_DIM):
tile1_positions[i] = sorted_positions[i, tile1_particle_index]
tile1_charges = sorted_charges[0, tile1_particle_index]
# Local data
local_forces = cuda.local.array((SPATIAL_DIM), NUMBA_FLOAT)
vec = cuda.local.array((SPATIAL_DIM), NUMBA_FLOAT)
        energy = NUMBA_FLOAT(0)
        # Dereference the single-element device arrays into thread-local scalars
        inverse_k = inverse_k[0]
        inverse_sqrt_pi = NUMBA_FLOAT(1) / math.sqrt(NUMBA_FLOAT(math.pi))
        ewald_coefficient = ewald_coefficient[0]
        cutoff_radius = cutoff_radius[0]
for i in range(SPATIAL_DIM):
local_forces[i] = 0
for neighbor_index in range(tile_neighbors.shape[1]):
tile_id2 = tile_neighbors[tile_id1, neighbor_index]
tile2_particle_index = tile_id2 * NUM_PARTICLES_PER_TILE + local_thread_x
cuda.syncwarp()
for i in range(SPATIAL_DIM):
tile2_positions[i, local_thread_y, local_thread_x] = sorted_positions[
i, tile2_particle_index
]
tile2_charges[local_thread_y, local_thread_x] = sorted_charges[
0, tile2_particle_index
]
exclusion_flag = exclusion_map[neighbor_index, tile1_particle_index]
            cuda.syncwarp()
            # Padding entries in tile_neighbors are -1; break only after the
            # warp-wide loads and syncwarp so every lane reaches the barrier
            if tile_id2 == -1:
                break
# Computation
for particle_index in range(NUM_PARTICLES_PER_TILE):
if exclusion_flag >> particle_index & 0b1:
continue
r = NUMBA_FLOAT(0)
for i in range(SPATIAL_DIM):
vec[i] = (
tile2_positions[i, local_thread_y, particle_index]
- tile1_positions[i]
)
if vec[i] < -local_half_pbc_matrix[i]:
vec[i] += local_pbc_matrix[i]
elif vec[i] > local_half_pbc_matrix[i]:
vec[i] -= local_pbc_matrix[i]
r += vec[i] ** 2
r = math.sqrt(r)
if r < cutoff_radius:
ewald_r = ewald_coefficient * r
e1e2_over_k = (
tile1_charges
* tile2_charges[local_thread_y, particle_index]
* inverse_k
)
inverse_r = NUMBA_FLOAT(1) / r
erfc_over_r = math.erfc(ewald_r) * inverse_r
                    energy += e1e2_over_k * erfc_over_r * NUMBA_FLOAT(0.5)  # Halved: pair visited from both tiles
force_val = (
-e1e2_over_k
* (
NUMBA_FLOAT(2)
* ewald_coefficient
* math.exp(-((ewald_r) ** 2))
* inverse_sqrt_pi
+ erfc_over_r
)
* inverse_r
* inverse_r
)
for i in range(SPATIAL_DIM):
local_forces[i] += force_val * vec[i]
for i in range(SPATIAL_DIM):
cuda.atomic.add(sorted_forces, (i, tile1_particle_index), local_forces[i])
cuda.atomic.add(potential_energy, 0, energy)
@staticmethod
def _update_excluded_pme_direct_part_kernel(
inverse_k,
ewald_coefficient,
cutoff_radius,
pbc_matrix,
positions,
charges,
excluded_particles,
forces,
potential_energy,
):
particle1 = cuda.grid(1)
local_thread_x = cuda.threadIdx.x
if particle1 >= positions.shape[0]:
return
shared_pbc_matrix = cuda.shared.array((SPATIAL_DIM), NUMBA_FLOAT)
shared_half_pbc_matrix = cuda.shared.array((SPATIAL_DIM), NUMBA_FLOAT)
        if local_thread_x <= 2:  # First three threads load the PBC diagonal
shared_pbc_matrix[local_thread_x] = pbc_matrix[
local_thread_x, local_thread_x
]
shared_half_pbc_matrix[local_thread_x] = shared_pbc_matrix[
local_thread_x
] * NUMBA_FLOAT(0.5)
cuda.syncthreads()
local_positions = cuda.local.array((3), NUMBA_FLOAT)
local_forces = cuda.local.array((3), NUMBA_FLOAT)
vec = cuda.local.array((3), NUMBA_FLOAT)
e1 = charges[particle1, 0]
inverse_k = inverse_k[0]
ewald_coefficient = ewald_coefficient[0]
cutoff_radius = cutoff_radius[0]
inverse_sqrt_pi = NUMBA_FLOAT(1) / math.sqrt(NUMBA_FLOAT(math.pi))
for i in range(SPATIAL_DIM):
local_positions[i] = positions[particle1, i]
local_forces[i] = 0
energy = NUMBA_FLOAT(0)
        is_excluded = False  # Skip the atomic adds when the particle has no exclusions
for i in range(excluded_particles.shape[1]):
particle2 = excluded_particles[particle1, i]
if particle2 == -1:
break
is_excluded = True
            r = NUMBA_FLOAT(0)
            for j in range(SPATIAL_DIM):
                vec[j] = positions[particle2, j] - local_positions[j]
                if vec[j] < -shared_half_pbc_matrix[j]:
                    vec[j] += shared_pbc_matrix[j]
                elif vec[j] > shared_half_pbc_matrix[j]:
                    vec[j] -= shared_pbc_matrix[j]
                r += vec[j] ** 2
r = math.sqrt(r)
if r < cutoff_radius:
ewald_r = ewald_coefficient * r
e1e2_over_k = e1 * charges[particle2, 0] * inverse_k
inverse_r = NUMBA_FLOAT(1) / r
erf_over_r = math.erf(ewald_r) * inverse_r
force_val = (
e1e2_over_k
* (
NUMBA_FLOAT(2)
* ewald_coefficient
* math.exp(-((ewald_r) ** 2))
* inverse_sqrt_pi
- erf_over_r
)
* inverse_r
* inverse_r
)
                for j in range(SPATIAL_DIM):
                    local_forces[j] -= force_val * vec[j]
                energy -= e1e2_over_k * erf_over_r * NUMBA_FLOAT(0.5)  # Halved: pair visited from both particles
if is_excluded:
for i in range(SPATIAL_DIM):
cuda.atomic.add(forces, (particle1, i), local_forces[i])
cuda.atomic.add(potential_energy, 0, energy)
@staticmethod
def _update_bspline_kernel(
positions,
grid_size,
pbc_matrix,
spline_coefficient,
spline_derivative_coefficient,
grid_map,
):
particle_id = cuda.grid(1)
thread_x = cuda.threadIdx.x
if particle_id >= positions.shape[0]:
return None
# Shared array
# PBC matrix
shared_pbc_matrix = cuda.shared.array((SPATIAL_DIM), NUMBA_FLOAT)
shared_half_pbc_matrix = cuda.shared.array((SPATIAL_DIM), NUMBA_FLOAT)
# num_cell_vec
shared_grid_size = cuda.shared.array((3), NUMBA_INT)
if thread_x == 0:
shared_pbc_matrix[0] = pbc_matrix[0, 0]
shared_pbc_matrix[1] = pbc_matrix[1, 1]
shared_pbc_matrix[2] = pbc_matrix[2, 2]
shared_half_pbc_matrix[0] = shared_pbc_matrix[0] / 2
shared_half_pbc_matrix[1] = shared_pbc_matrix[1] / 2
shared_half_pbc_matrix[2] = shared_pbc_matrix[2] / 2
elif thread_x == 1:
shared_grid_size[0] = grid_size[0]
shared_grid_size[1] = grid_size[1]
shared_grid_size[2] = grid_size[2]
cuda.syncthreads()
position_x = positions[particle_id, 0] + shared_half_pbc_matrix[0]
position_y = positions[particle_id, 1] + shared_half_pbc_matrix[1]
position_z = positions[particle_id, 2] + shared_half_pbc_matrix[2]
grid_index_x = position_x / shared_pbc_matrix[0] * shared_grid_size[0]
grid_index_y = position_y / shared_pbc_matrix[1] * shared_grid_size[1]
grid_index_z = position_z / shared_pbc_matrix[2] * shared_grid_size[2]
grid_index = cuda.local.array((SPATIAL_DIM), NUMBA_INT)
grid_index[0] = math.floor(grid_index_x)
grid_index[1] = math.floor(grid_index_y)
grid_index[2] = math.floor(grid_index_z)
grid_fraction = cuda.local.array((SPATIAL_DIM), NUMBA_FLOAT)
grid_fraction[0] = grid_index_x - grid_index[0]
grid_fraction[1] = grid_index_y - grid_index[1]
grid_fraction[2] = grid_index_z - grid_index[2]
local_spline_coefficient = cuda.local.array(
(SPATIAL_DIM, PME_ORDER), NUMBA_FLOAT
)
local_spline_derivative_coefficient = cuda.local.array(
(SPATIAL_DIM, PME_ORDER), NUMBA_FLOAT
)
        # 3rd-order B-spline coefficients
for i in range(SPATIAL_DIM):
local_spline_coefficient[i, 2] = NUMBA_FLOAT(0.5) * grid_fraction[i] ** 2
local_spline_coefficient[i, 0] = (
NUMBA_FLOAT(0.5) * (1 - grid_fraction[i]) ** 2
)
local_spline_coefficient[i, 1] = (
NUMBA_FLOAT(1)
- local_spline_coefficient[i, 0]
- local_spline_coefficient[i, 2]
)
        # 4th-order B-spline derivative coefficients
for i in range(SPATIAL_DIM):
local_spline_derivative_coefficient[i, 0] = -local_spline_coefficient[i, 0]
local_spline_derivative_coefficient[i, 1] = (
local_spline_coefficient[i, 0] - local_spline_coefficient[i, 1]
)
local_spline_derivative_coefficient[i, 2] = (
local_spline_coefficient[i, 1] - local_spline_coefficient[i, 2]
)
local_spline_derivative_coefficient[i, 3] = local_spline_coefficient[i, 2]
        # 4th-order B-spline coefficients
for i in range(SPATIAL_DIM):
local_spline_coefficient[i, 3] = (
grid_fraction[i] * local_spline_coefficient[i, 2] / NUMBA_FLOAT(3)
)
local_spline_coefficient[i, 2] = (
(NUMBA_FLOAT(1) + grid_fraction[i]) * local_spline_coefficient[i, 1]
+ (NUMBA_FLOAT(3) - grid_fraction[i]) * local_spline_coefficient[i, 2]
) / 3
local_spline_coefficient[i, 0] = (
(NUMBA_FLOAT(1) - grid_fraction[i])
* local_spline_coefficient[i, 0]
/ NUMBA_FLOAT(3)
)
local_spline_coefficient[i, 1] = NUMBA_FLOAT(1) - (
local_spline_coefficient[i, 0]
+ local_spline_coefficient[i, 2]
+ local_spline_coefficient[i, 3]
)
# Set value
for i in range(SPATIAL_DIM):
for j in range(PME_ORDER):
cuda.atomic.add(
spline_coefficient,
(particle_id, i, j),
local_spline_coefficient[i, j],
)
cuda.atomic.add(
spline_derivative_coefficient,
(particle_id, i, j),
local_spline_derivative_coefficient[i, j],
)
                index = grid_index[i] + j - 1  # Neighboring grid point, wrapped below
if index >= shared_grid_size[i]:
index -= shared_grid_size[i]
elif index < 0:
index += shared_grid_size[i]
cuda.atomic.add(grid_map, (particle_id, i, j), index)
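    # Two invariants of the weights computed above make a cheap host-side
    # sanity check (illustrative, on the cupy arrays this kernel fills in
    # update()): per particle and axis the four spline coefficients sum to 1
    # and the four derivative coefficients sum to 0, e.g.
    #     >>> cp.allclose(spline_coefficient.sum(-1), 1, atol=1e-6)
    #     >>> cp.allclose(spline_derivative_coefficient.sum(-1), 0, atol=1e-6)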
@staticmethod
def _update_charge_map_kernel(
num_particles, spline_coefficient, grid_map, charges, charge_map
):
"""
spline_coefficient: [num_particles, SPATIAL_DIM, PME_ORDER] The spline coefficient of particles
grid_map: [num_particles, SPATIL_DIM, PME_ORDER]: The indice of grid to add charge of each particles
charges: [num_particles,]: The charges of particles
charge_map: [grid_size_x, grid_size_y, grid_size_z]
"""
particle_id = cuda.grid(1)
num_particles = num_particles[0]
if particle_id >= num_particles:
return None
charge = charges[particle_id, 0]
for i in range(PME_ORDER):
grid_x = grid_map[particle_id, 0, i]
charge_x = charge * spline_coefficient[particle_id, 0, i]
for j in range(PME_ORDER):
grid_y = grid_map[particle_id, 1, j]
charge_xy = charge_x * spline_coefficient[particle_id, 1, j]
for k in range(PME_ORDER):
grid_z = grid_map[particle_id, 2, k]
charge_xyz = charge_xy * spline_coefficient[particle_id, 2, k]
cuda.atomic.add(charge_map, (grid_x, grid_y, grid_z), charge_xyz)
@staticmethod
def _update_reciprocal_electric_potential_map_kernel(k, charge_map, bc_grid):
        # Solve the reciprocal-space Poisson equation as a convolution:
        # potential = IFFT[B(m) * C(m) * FFT(charge / k)]
        fft_charge = cp.fft.fftn(charge_map / k)
        fft_charge[0, 0, 0] = 0  # Drop the m = 0 (net-charge) term
electric_potential_map = cp.real(cp.fft.ifftn(fft_charge * bc_grid))
return electric_potential_map.astype(CUPY_FLOAT)
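    # Minimal usage sketch of the kernel above on a toy grid (hypothetical,
    # standalone values; the real call site is update(), which passes
    # self._device_k, self._charge_map and self._device_bc_grid, with bc_grid
    # built in bind_ensemble):
    #     >>> charge_map = cp.zeros((32, 32, 32), CUPY_FLOAT)
    #     >>> charge_map[16, 16, 16] = 1
    #     >>> phi = ElectrostaticPMEConstraint._update_reciprocal_electric_potential_map_kernel(
    #     ...     cp.array([1.0], CUPY_FLOAT), charge_map, bc_grid
    #     ... )
    #     >>> phi.shape  # same shape as the grid
    #     (32, 32, 32)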
@staticmethod
def _update_reciprocal_force_kernel(
spline_coefficient,
spline_derivative_coefficient,
grid_map,
electric_potential_map,
charges,
forces,
potential_energy,
):
"""
spline_coefficient: [num_particles, SPATIAL_DIM, PME_ORDER] The spline coefficient of particles
spline_derivative_coefficient: [num_particles, SPATIAL_DIM, PME_ORDER] The spline derivative coefficient of particles
grid_map: [num_particles, SPATIL_DIM, PME_ORDER]: The indice of grid to add charge of each particles
electric_potential_map: [grid_size_x, grid_size_y, grid_size_z]: Electric potential on each grid point
forces: [num_particles, SPATIAL_DIM]
"""
particle_id = cuda.grid(1)
if particle_id >= charges.shape[0]:
return None
charge = charges[particle_id, 0]
force_x = 0
force_y = 0
force_z = 0
energy = 0
for i in range(PME_ORDER):
grid_x = grid_map[particle_id, 0, i]
spline_coefficient_x = spline_coefficient[particle_id, 0, i]
spline_derivative_coefficient_x = spline_derivative_coefficient[
particle_id, 0, i
]
for j in range(PME_ORDER):
grid_y = grid_map[particle_id, 1, j]
spline_coefficient_y = spline_coefficient[particle_id, 1, j]
spline_derivative_coefficient_y = spline_derivative_coefficient[
particle_id, 1, j
]
for k in range(PME_ORDER):
grid_z = grid_map[particle_id, 2, k]
spline_coefficient_z = spline_coefficient[particle_id, 2, k]
spline_derivative_coefficient_z = spline_derivative_coefficient[
particle_id, 2, k
]
                    potential = electric_potential_map[grid_x, grid_y, grid_z]
                    cur_energy = (
                        spline_coefficient_x
                        * spline_coefficient_y
                        * spline_coefficient_z
                        * potential
                    )
                    # Multiplying the untouched coefficients directly (rather
                    # than dividing cur_energy) avoids 0/0 when a spline
                    # coefficient vanishes at the edge of its support
                    force_x -= (
                        spline_derivative_coefficient_x
                        * spline_coefficient_y
                        * spline_coefficient_z
                        * potential
                    )
                    force_y -= (
                        spline_coefficient_x
                        * spline_derivative_coefficient_y
                        * spline_coefficient_z
                        * potential
                    )
                    force_z -= (
                        spline_coefficient_x
                        * spline_coefficient_y
                        * spline_derivative_coefficient_z
                        * potential
                    )
                    energy += cur_energy
cuda.atomic.add(forces, (particle_id, 0), force_x * charge)
cuda.atomic.add(forces, (particle_id, 1), force_y * charge)
cuda.atomic.add(forces, (particle_id, 2), force_z * charge)
cuda.atomic.add(potential_energy, 0, energy)
def update(self):
self._check_bound_state()
# Direct part
self._direct_potential_energy = cp.zeros([1], CUPY_FLOAT)
sorted_forces = cp.zeros(
(
SPATIAL_DIM,
self._parent_ensemble.tile_list.num_tiles * NUM_PARTICLES_PER_TILE,
),
CUPY_FLOAT,
)
thread_per_block = (NUM_PARTICLES_PER_TILE, TILES_PER_THREAD)
block_per_grid = int(
np.ceil(self._parent_ensemble.tile_list.num_tiles / TILES_PER_THREAD)
)
self._update_pme_direct_part[block_per_grid, thread_per_block](
self._device_inverse_k,
self._device_ewald_coefficient,
self._device_cutoff_radius,
self._parent_ensemble.state.device_pbc_matrix,
self._parent_ensemble.state.sorted_positions,
self._parent_ensemble.topology.device_sorted_charges,
self._parent_ensemble.topology.device_exclusion_map,
self._parent_ensemble.tile_list.tile_neighbors,
sorted_forces,
self._direct_potential_energy,
)
self._direct_forces = self._parent_ensemble.tile_list.unsort_matrix(
sorted_forces
)
thread_per_block = 64
block_per_grid = int(
np.ceil(self._parent_ensemble.topology.num_particles / thread_per_block)
)
self._update_excluded_pme_direct_part[block_per_grid, thread_per_block](
self._device_inverse_k,
self._device_ewald_coefficient,
self._device_cutoff_radius,
self._parent_ensemble.state.device_pbc_matrix,
self._parent_ensemble.state.positions,
self._parent_ensemble.topology.device_charges,
self._parent_ensemble.topology.device_excluded_particles,
self._direct_forces,
self._direct_potential_energy,
)
# Reciprocal part
thread_per_block = 128
block_per_grid = int(
np.ceil(self._parent_ensemble.topology.num_particles / thread_per_block)
)
# Bspline
spline_coefficient = cp.zeros(
[self._parent_ensemble.topology.num_particles, SPATIAL_DIM, PME_ORDER],
CUPY_FLOAT,
)
spline_derivative_coefficient = cp.zeros(
[self._parent_ensemble.topology.num_particles, SPATIAL_DIM, PME_ORDER],
CUPY_FLOAT,
)
grid_map = cp.zeros(
[self._parent_ensemble.topology.num_particles, SPATIAL_DIM, PME_ORDER],
CUPY_INT,
)
self._update_bspline[block_per_grid, thread_per_block](
self._parent_ensemble.state.positions,
self._device_grid_size,
self._parent_ensemble.state.device_pbc_matrix,
spline_coefficient,
spline_derivative_coefficient,
grid_map,
)
# Map charge
self._charge_map = cp.zeros(self._grid_size, CUPY_FLOAT)
self._update_charge_map[block_per_grid, thread_per_block](
self._device_num_particles,
spline_coefficient,
grid_map,
self._parent_ensemble.topology.device_charges,
self._charge_map,
)
# Reciprocal convolution
self._electric_potential_map = self._update_electric_potential_map(
self._device_k, self._charge_map, self._device_bc_grid
)
# Reciprocal force
self._reciprocal_forces = cp.zeros(
self._parent_ensemble.state.matrix_shape, CUPY_FLOAT
)
self._reciprocal_potential_energy = cp.zeros([1], CUPY_FLOAT)
self._update_reciprocal_force[block_per_grid, thread_per_block](
spline_coefficient,
spline_derivative_coefficient,
grid_map,
self._electric_potential_map,
self._parent_ensemble.topology.device_charges,
self._reciprocal_forces,
self._reciprocal_potential_energy,
)
        self._reciprocal_forces = (
            self._reciprocal_forces * self._device_reciprocal_factor
        )  # Convert the gradient from grid indices to Cartesian coordinates
        self._reciprocal_forces -= self._reciprocal_forces.mean(0)  # Remove net force
# Summary
self._potential_energy = (
self._direct_potential_energy
+ self._reciprocal_potential_energy
- self._device_self_potential_energy
)
self._forces = self._direct_forces + self._reciprocal_forces
@property
def grid_size(self):
return self._grid_size
@property
def ewald_coefficient(self):
return self._ewald_coefficient
if __name__ == "__main__":
import os
import mdpy as md
from mdpy.unit import *
data_dir = (
"/home/zhenyuwei/nutstore/ZhenyuWei/Note_Research/mdpy/mdpy/benchmark/data"
)
psf_file = os.path.join(data_dir, "str.psf")
pdb_file = os.path.join(data_dir, "str.pdb")
cutoff_radius = Quantity(9, angstrom)
# IO
psf = md.io.PSFParser(psf_file)
pdb = md.io.PDBParser(pdb_file)
charmm_prm = md.io.CharmmTopparParser(
os.path.join(data_dir, "par_all36_prot.prm"),
os.path.join(data_dir, "toppar_water_ions_namd.str"),
)
# constraint
ensemble = md.core.Ensemble(psf.topology, pdb.pbc_matrix)
    constraint = ElectrostaticPMEConstraint(cutoff_radius=cutoff_radius)
ensemble.add_constraints(constraint)
ensemble.state.set_positions(pdb.positions)
ensemble.update_tile_list()
constraint.update()
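    # A couple of quick diagnostics (illustrative): the Ewald coefficient
    # solved from the cutoff/tolerance pair and the FFT grid chosen for the box
    print(f"Ewald coefficient: {constraint.ewald_coefficient:.4f} 1/angstrom")
    print(f"PME grid size: {constraint.grid_size}")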
# print(constraint.forces)
print(
Quantity(constraint.forces, default_force_unit)
.convert_to(kilojoule_permol_over_nanometer)
.value
)