Source code for MCEq.kernels

# -*- coding: utf-8 -*-
"""
:mod:`MCEq.kernels` --- calculation kernels for the forward-euler integrator
============================================================================

The module contains functions which are called by the forward-euler
integration routine :func:`MCEq.core.MCEqRun.forward_euler`.

The integration is part of these functions. The single step

.. math::

  \Phi_{i + 1} = \\left[\\boldsymbol{M}_{int} + \\frac{1}{\\rho(X_i)}\\boldsymbol{M}_{dec}\\right]
  \\cdot \\Phi_i \\cdot \\Delta X_i

with

.. math::
  \\boldsymbol{M}_{int} = (-\\boldsymbol{1} + \\boldsymbol{C}){\\boldsymbol{\\Lambda}}_{int}
  :label: int_matrix

and

.. math::
  \\boldsymbol{M}_{dec} = (-\\boldsymbol{1} + \\boldsymbol{D}){\\boldsymbol{\\Lambda}}_{dec}.
  :label: dec_matrix

The functions use different libraries for sparse and dense linear algebra (BLAS):

- The default for dense or sparse matrix representations is the function :func:`kern_numpy`.
  It uses the dot-product implementation of :mod:`numpy`. Depending on the details, your
  :mod:`numpy` installation can be already linked to some BLAS library like as ATLAS or MKL,
  what typically accelerates the calculation significantly.
- The fastest version, :func:`kern_MKL_sparse`, directly interfaces to the sparse BLAS routines
  from `Intel MKL <https://software.intel.com/en-us/intel-mkl>`_ via :mod:`ctypes`. If you have the
  MKL runtime installed, this function is recommended for most purposes.
- The GPU accelerated versions :func:`kern_CUDA_dense` and :func:`kern_CUDA_sparse` are implemented
  using the cuBLAS or cuSPARSE libraries, respectively. They should be considered as experimental or
  implementation examples if you need extremely high performance. To keep Python as the main
  programming language, these interfaces are accessed via the module :mod:`numbapro`, which is part
  of the `Anaconda Accelerate <https://store.continuum.io/cshop/accelerate/>`_ package. It is free
  for academic use.

"""
import numpy as np
from mceq_config import config


[docs]def kern_numpy(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs, mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None, prog_bar=None, fa_vars=None): """:mod;`numpy` implementation of forward-euler integration. Args: nsteps (int): number of integration steps dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2 rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}` int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation phi (numpy.array): initial state vector :math:`\\Phi(X_0)` prog_bar (object,optional): handle to :class:`ProgressBar` object fa_vars (dict,optional): contains variables for first interaction mode Returns: numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration """ grid_sol = [] grid_step = 0 imc = int_m dmc = dec_m dxc = dX ric = rho_inv phc = phi enmuloss = config['enable_muon_energy_loss'] de = mu_egrid.size muloss_min_step = config['muon_energy_loss_min_step'] lidx, nmuspec = mu_lidx_nsp # Accumulate at least a few g/cm2 for energy loss steps # to avoid numerical errors dXaccum = 0. if config['FP_precision'] == 32: imc = int_m.astype(np.float32) dmc = dec_m.astype(np.float32) dxc = dX.astype(np.float32) ric = rho_inv.astype(np.float32) phc = phi.astype(np.float32) from time import time start = time() stepper = None # Implmentation of first interaction mode if config['first_interaction_mode']: def stepper(step): if step <= fa_vars['max_step']: return (- fa_vars['Lambda_int'] * phc + imc.dot(fa_vars['fi_switch'][step] * phc) + dmc.dot(ric[step] * phc)) * dxc[step] else: # Equivalent of setting interaction matrix to 0 return (- fa_vars['Lambda_int'] * phc + dmc.dot(ric[step] * phc)) * dxc[step] else: def stepper(step): return (imc.dot(phc) + dmc.dot(ric[step] * phc)) * dxc[step] for step in xrange(nsteps): if prog_bar and (step % 200 == 0): prog_bar.update(step) phc += stepper(step) dXaccum += dxc[step] if (enmuloss and (dXaccum > muloss_min_step or step == nsteps - 1)): for nsp in xrange(nmuspec): phc[lidx + de * nsp: lidx + de * (nsp + 1)] = np.interp( mu_egrid, mu_egrid + mu_dEdX * dXaccum, phc[lidx + de * nsp:lidx + de * (nsp + 1)]) dXaccum = 0. if (grid_idcs and grid_step < len(grid_idcs) and grid_idcs[grid_step] == step): grid_sol.append(np.copy(phc)) grid_step += 1 print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps)) return phc, grid_sol
[docs]def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs, mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None, prog_bar=None): """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation of forward-euler integration. Function requires a working :mod:`numbapro` installation. It is typically slower compared to :func:`kern_MKL_sparse` but it depends on your hardware. Args: nsteps (int): number of integration steps dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2 rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}` int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation phi (numpy.array): initial state vector :math:`\\Phi(X_0)` prog_bar (object,optional): handle to :class:`ProgressBar` object Returns: numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration """ fl_pr = None if config['FP_precision'] == 32: fl_pr = np.float32 elif config['FP_precision'] == 64: fl_pr = np.float64 else: raise Exception("kern_CUDA_dense(): Unknown precision specified.") # if config['enable_muon_energyloss']: # raise NotImplementedError('kern_CUDA_dense(): ' + # 'Energy loss not imlemented for this solver.') if config['enable_muon_energy_loss']: raise NotImplementedError('kern_CUDA_dense(): ' + 'Energy loss not imlemented for this solver.') #======================================================================= # Setup GPU stuff and upload data to it #======================================================================= try: from accelerate.cuda.blas import Blas from accelerate.cuda import cuda except ImportError: raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " + "installed.\nCan not use GPU.") cubl = Blas() m, n = int_m.shape stream = cuda.stream() cu_int_m = cuda.to_device(int_m.astype(fl_pr), stream) cu_dec_m = cuda.to_device(dec_m.astype(fl_pr), stream) cu_curr_phi = cuda.to_device(phi.astype(fl_pr), stream) cu_delta_phi = cuda.device_array(phi.shape, dtype=fl_pr) from time import time start = time() for step in xrange(nsteps): if prog_bar: prog_bar.update(step) cubl.gemv(trans='N', m=m, n=n, alpha=fl_pr(1.0), A=cu_int_m, x=cu_curr_phi, beta=fl_pr(0.0), y=cu_delta_phi) cubl.gemv(trans='N', m=m, n=n, alpha=fl_pr(rho_inv[step]), A=cu_dec_m, x=cu_curr_phi, beta=fl_pr(1.0), y=cu_delta_phi) cubl.axpy(alpha=fl_pr(dX[step]), x=cu_delta_phi, y=cu_curr_phi) print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps)) return cu_curr_phi.copy_to_host(), []
class CUDASparseContext(object): def __init__(self, int_m, dec_m, device_id=0): if config['FP_precision'] == 32: self.fl_pr = np.float32 elif config['FP_precision'] == 64: self.fl_pr = np.float64 else: raise Exception( "CUDASparseContext(): Unknown precision specified.") #====================================================================== # Setup GPU stuff and upload data to it #====================================================================== try: from accelerate.cuda.blas import Blas import accelerate.cuda.sparse as cusparse from accelerate.cuda import cuda except ImportError: raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " + "installed.\nCan not use GPU.") cuda.select_device(0) self.cuda = cuda self.cusp = cusparse.Sparse() self.cubl = Blas() self.set_matrices(int_m, dec_m) def set_matrices(self, int_m, dec_m): import accelerate.cuda.sparse as cusparse from accelerate.cuda import cuda self.m, self.n = int_m.shape self.int_m_nnz = int_m.nnz self.int_m_csrValA = cuda.to_device(int_m.data.astype(self.fl_pr)) self.int_m_csrRowPtrA = cuda.to_device(int_m.indptr) self.int_m_csrColIndA = cuda.to_device(int_m.indices) self.dec_m_nnz = dec_m.nnz self.dec_m_csrValA = cuda.to_device(dec_m.data.astype(self.fl_pr)) self.dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr) self.dec_m_csrColIndA = cuda.to_device(dec_m.indices) self.descr = self.cusp.matdescr() self.descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO self.cu_delta_phi = self.cuda.device_array_like( np.zeros(self.m, dtype=self.fl_pr)) print np.zeros(self.m, dtype=self.fl_pr).shape def set_phi(self, phi): self.cu_curr_phi = self.cuda.to_device(phi.astype(self.fl_pr)) def get_phi(self): return self.cu_curr_phi.copy_to_host() def do_step(self, rho_inv, dX): self.cusp.csrmv(trans='N', m=self.m, n=self.n, nnz=self.int_m_nnz, descr=self.descr, alpha=self.fl_pr(1.0), csrVal=self.int_m_csrValA, csrRowPtr=self.int_m_csrRowPtrA, csrColInd=self.int_m_csrColIndA, x=self.cu_curr_phi, beta=self.fl_pr(0.0), y=self.cu_delta_phi) # print np.sum(cu_curr_phi.copy_to_host()) self.cusp.csrmv(trans='N', m=self.m, n=self.n, nnz=self.dec_m_nnz, descr=self.descr, alpha=self.fl_pr(rho_inv), csrVal=self.dec_m_csrValA, csrRowPtr=self.dec_m_csrRowPtrA, csrColInd=self.dec_m_csrColIndA, x=self.cu_curr_phi, beta=self.fl_pr(1.0), y=self.cu_delta_phi) self.cubl.axpy(alpha=self.fl_pr( dX), x=self.cu_delta_phi, y=self.cu_curr_phi) # class CUDASparseContextSkcuda(object): # # *Doesn't work right now because of crashes and missing wrappers for cuSPARSE # # def __init__(self, int_m, dec_m, device_id=0): # #======================================================================= # # Setup GPU stuff and upload data to it # #======================================================================= # # try: # import pycuda.autoinit # import pycuda.gpuarray as gpuarray # from skcuda import cusparse, cublas # # except ImportError: # # raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " + # # "installed.\nCan not use GPU.") # # cuda.select_device(0) # self.cusp_handle = cusparse.cusparseCreate() # self.cubl_handle = cublas.cublasCreate() # self.gpua = gpuarray # if config['FP_precision'] == 32: # self.fl_pr = np.float32 # self.mv = cusparse._libcusparse.cusparseScsrmv # self.axpy = cublas._libcublas.cublasSaxpy # elif config['FP_precision'] == 64: # self.fl_pr = np.float64 # self.mv = cusparse._libcusparse.cusparseDcsrmv # self.axpy = cublas._libcublas.cublasDaxpy # else: # raise Exception("CUDASparseContext(): Unknown precision specified.") # self.set_matrices(int_m, dec_m) # def set_matrices(self, int_m, dec_m): # from skcuda import cusparse # self.m, self.n = int_m.shape # self.int_m_nnz = int_m.nnz # self.int_m_csrValA = self.gpua.to_gpu(int_m.data.astype(self.fl_pr)) # self.int_m_csrRowPtrA = self.gpua.to_gpu(int_m.indptr) # self.int_m_csrColIndA = self.gpua.to_gpu(int_m.indices) # self.dec_m_nnz = dec_m.nnz # self.dec_m_csrValA = self.gpua.to_gpu(dec_m.data.astype(self.fl_pr)) # self.dec_m_csrRowPtrA = self.gpua.to_gpu(dec_m.indptr) # self.dec_m_csrColIndA = self.gpua.to_gpu(dec_m.indices) # self.descr = cusparse.cusparseMatDescr() # self.descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO # self.cu_delta_phi = self.gpua.empty(self.m,dtype=self.fl_pr) # def set_phi(self, phi): # self.cu_curr_phi = self.gpua.to_gpu(phi.astype(self.fl_pr)) # def get_phi(self): # return self.cu_curr_phi.get() # def do_step(self, rho_inv, dX): # from skcuda import cusparse, cublas # cusparse._libcusparse.cusparseScsrmv(self.cusp_handle, # trans='n', m=self.m, n=self.n, nnz=self.int_m_nnz, # alpha=self.fl_pr(1.0), descrA=self.descr, # csrVal=self.int_m_csrValA.gpudata, # csrRowPtr=self.int_m_csrRowPtrA.gpudata, # csrColInd=self.int_m_csrColIndA.gpudata, # x=self.cu_curr_phi.gpudata, beta=self.fl_pr(0.0), y=self.cu_delta_phi.gpudata) # # # # print np.sum(cu_curr_phi.copy_to_host()) # cusparse._libcusparse.cusparseScsrmv(self.cusp_handle, trans='N', m=self.m, n=self.n, # nnz=self.dec_m_nnz, # descr=self.descr, # alpha=self.fl_pr(rho_inv), # csrVal=self.dec_m_csrValA.gpudata, # csrRowPtr=self.dec_m_csrRowPtrA.gpudata, # csrColInd=self.dec_m_csrColIndA.gpudata, # x=self.cu_curr_phi.gpudata, beta=self.fl_pr(1.0), # y=self.cu_delta_phi.gpudata) # cublas.cublasSaxpy(self.cubl_handle, self.cu_curr_phi.size, # self.fl_pr(dX), # self.cu_delta_phi.gpudata, 1, self.cu_curr_phi.gpudata, 1)
[docs]def kern_CUDA_sparse(nsteps, dX, rho_inv, context, phi, grid_idcs, mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None, prog_bar=None): """`NVIDIA CUDA cuSPARSE <https://developer.nvidia.com/cusparse>`_ implementation of forward-euler integration. Function requires a working :mod:`accelerate` installation. Args: nsteps (int): number of integration steps dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2 rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}` int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation phi (numpy.array): initial state vector :math:`\\Phi(X_0)` prog_bar (object,optional): handle to :class:`ProgressBar` object Returns: numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration """ c = context c.set_phi(phi) enmuloss = config['enable_muon_energy_loss'] de = mu_egrid.size mu_egrid = mu_egrid.astype(c.fl_pr) muloss_min_step = config['muon_energy_loss_min_step'] lidx, nmuspec = mu_lidx_nsp # Accumulate at least a few g/cm2 for energy loss steps # to avoid numerical errors dXaccum = 0. grid_step = 0 grid_sol = [] from time import time start = time() for step in xrange(nsteps): if prog_bar and (step % 5 == 0): prog_bar.update(step) c.do_step(rho_inv[step], dX[step]) dXaccum += dX[step] if enmuloss and (dXaccum > muloss_min_step or step == nsteps - 1): # Download current solution vector to host phc = c.get_phi() for nsp in xrange(nmuspec): phc[lidx + de * nsp: lidx + de * (nsp + 1)] = np.interp( mu_egrid, mu_egrid + mu_dEdX * dXaccum, phc[lidx + de * nsp:lidx + de * (nsp + 1)]) # Upload changed vector back.. c.set_phi(phc) dXaccum = 0. if (grid_idcs and grid_step < len(grid_idcs) and grid_idcs[grid_step] == step): grid_sol.append(c.get_phi()) grid_step += 1 print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps)) return c.get_phi(), grid_sol
[docs]def kern_MKL_sparse(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs, mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None, prog_bar=None): """`Intel MKL sparse BLAS <https://software.intel.com/en-us/articles/intel-mkl-sparse-blas-overview?language=en>`_ implementation of forward-euler integration. Function requires that the path to the MKL runtime library ``libmkl_rt.[so/dylib]`` defined in the config file. Args: nsteps (int): number of integration steps dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2 rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}` int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation phi (numpy.array): initial state vector :math:`\\Phi(X_0)` grid_idcs (list): indices at which longitudinal solutions have to be saved. prog_bar (object,optional): handle to :class:`ProgressBar` object Returns: numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration """ from ctypes import cdll, c_int, c_char, POINTER, byref try: mkl = cdll.LoadLibrary(config['MKL_path']) except OSError: raise Exception("kern_MKL_sparse(): MKL runtime library not " + "found. Please check path.") gemv = None axpy = None np_fl = None if config['FP_precision'] == 32: from ctypes import c_float as fl_pr # sparse CSR-matrix x dense vector gemv = mkl.mkl_scsrmv # dense vector + dense vector axpy = mkl.cblas_saxpy np_fl = np.float32 elif config['FP_precision'] == 64: from ctypes import c_double as fl_pr # sparse CSR-matrix x dense vector gemv = mkl.mkl_dcsrmv # dense vector + dense vector axpy = mkl.cblas_daxpy np_fl = np.float64 else: raise Exception("kern_MKL_sparse(): Unknown precision specified.") # Set number of threads mkl.mkl_set_num_threads(byref(c_int(config['MKL_threads']))) # Prepare CTYPES pointers for MKL sparse CSR BLAS int_m_data = int_m.data.ctypes.data_as(POINTER(fl_pr)) int_m_ci = int_m.indices.ctypes.data_as(POINTER(c_int)) int_m_pb = int_m.indptr[:-1].ctypes.data_as(POINTER(c_int)) int_m_pe = int_m.indptr[1:].ctypes.data_as(POINTER(c_int)) dec_m_data = dec_m.data.ctypes.data_as(POINTER(fl_pr)) dec_m_ci = dec_m.indices.ctypes.data_as(POINTER(c_int)) dec_m_pb = dec_m.indptr[:-1].ctypes.data_as(POINTER(c_int)) dec_m_pe = dec_m.indptr[1:].ctypes.data_as(POINTER(c_int)) npphi = np.copy(phi).astype(np_fl) phi = npphi.ctypes.data_as(POINTER(fl_pr)) npdelta_phi = np.zeros_like(npphi) delta_phi = npdelta_phi.ctypes.data_as(POINTER(fl_pr)) trans = c_char('n') npmatd = np.chararray(6) npmatd[0] = 'G' npmatd[3] = 'C' matdsc = npmatd.ctypes.data_as(POINTER(c_char)) m = c_int(int_m.shape[0]) cdzero = fl_pr(0.) cdone = fl_pr(1.) cione = c_int(1) enmuloss = config['enable_muon_energy_loss'] de = mu_egrid.size mu_egrid = mu_egrid.astype(np_fl) mu_dEdX = mu_dEdX.astype(np_fl) muloss_min_step = config['muon_energy_loss_min_step'] lidx, nmuspec = mu_lidx_nsp # Accumulate at least a few g/cm2 for energy loss steps # to avoid numerical errors dXaccum = 0. grid_step = 0 grid_sol = [] from time import time start = time() for step in xrange(nsteps): if prog_bar: prog_bar.update(step) # delta_phi = int_m.dot(phi) gemv(byref(trans), byref(m), byref(m), byref(cdone), matdsc, int_m_data, int_m_ci, int_m_pb, int_m_pe, phi, byref(cdzero), delta_phi) # delta_phi = rho_inv * dec_m.dot(phi) + delta_phi gemv(byref(trans), byref(m), byref(m), byref(fl_pr(rho_inv[step])), matdsc, dec_m_data, dec_m_ci, dec_m_pb, dec_m_pe, phi, byref(cdone), delta_phi) # phi = delta_phi * dX + phi axpy(m, fl_pr(dX[step]), delta_phi, cione, phi, cione) dXaccum += dX[step] if (enmuloss and (dXaccum > muloss_min_step or step == nsteps - 1)): for nsp in xrange(nmuspec): npphi[lidx + de * nsp: lidx + de * (nsp + 1)] = np.interp( mu_egrid, mu_egrid + mu_dEdX * dXaccum, npphi[lidx + de * nsp:lidx + de * (nsp + 1)]) dXaccum = 0. if (grid_idcs and grid_step < len(grid_idcs) and grid_idcs[grid_step] == step): grid_sol.append(np.copy(npphi)) grid_step += 1 print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps)) return npphi, grid_sol
[docs]def kern_XeonPHI_sparse(nsteps, dX, rho_inv, int_m, dec_m, phi, grid_idcs, mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None, prog_bar=None): """Experimental Xeon Phi support using pyMIC library. """ import sys import os sys.path.append(os.path.join(os.path.expanduser("~"), 'work/git/pymic')) import pymic as mic # load the library with the kernel function (on the target) device = mic.devices[1] base = os.path.dirname(os.path.abspath(__file__)) library = device.load_library(os.path.join(base, "../Xeon_Phi/libmceq.so")) # use the default stream stream = device.get_default_stream() # Prepare CTYPES pointers for MKL sparse CSR BLAS mic_int_m_data = stream.bind(int_m.data) mic_int_m_ci = stream.bind(int_m.indices) mic_int_m_pb = stream.bind(int_m.indptr[:-1]) mic_int_m_pe = stream.bind(int_m.indptr[1:]) mic_dec_m_data = stream.bind(dec_m.data) mic_dec_m_ci = stream.bind(dec_m.indices) mic_dec_m_pb = stream.bind(dec_m.indptr[:-1]) mic_dec_m_pe = stream.bind(dec_m.indptr[1:]) npphi = np.copy(phi) mic_phi = stream.bind(npphi) npdelta_phi = np.zeros_like(npphi) mic_delta_phi = stream.bind(npdelta_phi) mic_dX = stream.bind(dX) mic_rho_inv = stream.bind(rho_inv) m = int_m.shape[0] from time import time start = time() stream.invoke(library.mceq_kernel, m, nsteps, mic_phi, mic_delta_phi, mic_rho_inv, mic_dX, mic_int_m_data, mic_int_m_ci, mic_int_m_pb, mic_int_m_pe, mic_dec_m_data, mic_dec_m_ci, mic_dec_m_pb, mic_dec_m_pe) stream.sync() print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps)) mic_phi.update_host() stream.sync() return mic_phi.array, []