Source code for MCEq.kernels

# -*- coding: utf-8 -*-
"""
:mod:`MCEq.kernels` --- calculation kernels for the forward-euler integrator
============================================================================

The module contains functions which are called by the forward-euler
integration routine :func:`MCEq.core.MCEqRun.forward_euler`.

The integration is part of these functions. The single step

.. math::

  \Phi_{i + 1} = \\left[\\boldsymbol{M}_{int} + \\frac{1}{\\rho(X_i)}\\boldsymbol{M}_{dec}\\right]
  \\cdot \\Phi_i \\cdot \\Delta X_i

with

.. math::
  \\boldsymbol{M}_{int} = (-\\boldsymbol{1} + \\boldsymbol{C}){\\boldsymbol{\\Lambda}}_{int}
  :label: int_matrix

and

.. math::
  \\boldsymbol{M}_{dec} = (-\\boldsymbol{1} + \\boldsymbol{D}){\\boldsymbol{\\Lambda}}_{dec}.
  :label: dec_matrix

The functions use different libraries for sparse and dense linear algebra (BLAS):

- The default for dense or sparse matrix representations is the function :func:`kern_numpy`.
  It uses the dot-product implementation of :mod:`numpy`. Depending on the details, your
  :mod:`numpy` installation can be already linked to some BLAS library like as ATLAS or MKL,
  what typically accelerates the calculation significantly.
- The fastest version, :func:`kern_MKL_sparse`, directly interfaces to the sparse BLAS routines
  from `Intel MKL <https://software.intel.com/en-us/intel-mkl>`_ via :mod:`ctypes`. If you have the
  MKL runtime installed, this function is recommended for most purposes.
- The GPU accelerated versions :func:`kern_CUDA_dense` and :func:`kern_CUDA_sparse` are implemented
  using the cuBLAS or cuSPARSE libraries, respectively. They should be considered as experimental or
  implementation examples if you need extremely high performance. To keep Python as the main
  programming language, these interfaces are accessed via the module :mod:`numbapro`, which is part
  of the `Anaconda Accelerate <https://store.continuum.io/cshop/accelerate/>`_ package. It is free
  for academic use.

"""
import numpy as np
from mceq_config import config


[docs]def kern_numpy(nsteps, dX, rho_inv, int_m, dec_m,
               phi, grid_idcs,
               mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None,
               prog_bar=None, fa_vars=None):
    """:mod;`numpy` implementation of forward-euler integration.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      prog_bar (object,optional): handle to :class:`ProgressBar` object
      fa_vars (dict,optional): contains variables for first interaction mode
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """

    grid_sol = []
    grid_step = 0

    imc = int_m
    dmc = dec_m
    dxc = dX
    ric = rho_inv
    phc = phi

    enmuloss = config['enable_muon_energy_loss']
    de = mu_egrid.size
    muloss_min_step = config['muon_energy_loss_min_step']
    lidx, nmuspec = mu_lidx_nsp
    # Accumulate at least a few g/cm2 for energy loss steps
    # to avoid numerical errors
    dXaccum = 0.

    if config['FP_precision'] == 32:
        imc = int_m.astype(np.float32)
        dmc = dec_m.astype(np.float32)
        dxc = dX.astype(np.float32)
        ric = rho_inv.astype(np.float32)
        phc = phi.astype(np.float32)

    from time import time
    start = time()
    stepper = None

    # Implmentation of first interaction mode
    if config['first_interaction_mode']:
        def stepper(step):
            if step <= fa_vars['max_step']:
                return (- fa_vars['Lambda_int'] * phc
                        + imc.dot(fa_vars['fi_switch'][step] * phc)
                        + dmc.dot(ric[step] * phc)) * dxc[step]
            else:
                # Equivalent of setting interaction matrix to 0
                return (- fa_vars['Lambda_int'] * phc
                        + dmc.dot(ric[step] * phc)) * dxc[step]
    else:
        def stepper(step):
            return (imc.dot(phc)
                    + dmc.dot(ric[step] * phc)) * dxc[step]

    for step in xrange(nsteps):
        if prog_bar and (step % 200 == 0):
            prog_bar.update(step)
        phc += stepper(step)

        dXaccum += dxc[step]

        if (enmuloss and
                (dXaccum > muloss_min_step or step == nsteps - 1)):
            for nsp in xrange(nmuspec):
                phc[lidx + de * nsp: lidx + de * (nsp + 1)] = np.interp(
                    mu_egrid, mu_egrid + mu_dEdX * dXaccum,
                    phc[lidx + de * nsp:lidx + de * (nsp + 1)])

            dXaccum = 0.

        if (grid_idcs and grid_step < len(grid_idcs)
                and grid_idcs[grid_step] == step):
            grid_sol.append(np.copy(phc))
            grid_step += 1

    print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps))

    return phc, grid_sol


[docs]def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs,
                    mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None,
                    prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """

    fl_pr = None
    if config['FP_precision'] == 32:
        fl_pr = np.float32
    elif config['FP_precision'] == 64:
        fl_pr = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")

    # if config['enable_muon_energyloss']:
    #     raise NotImplementedError('kern_CUDA_dense(): ' +
    #         'Energy loss not imlemented for this solver.')

    if config['enable_muon_energy_loss']:
        raise NotImplementedError('kern_CUDA_dense(): ' +
                                  'Energy loss not imlemented for this solver.')

    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from accelerate.cuda.blas import Blas
        from accelerate.cuda import cuda
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " +
                        "installed.\nCan not use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(fl_pr), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(fl_pr), stream)
    cu_curr_phi = cuda.to_device(phi.astype(fl_pr), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=fl_pr)

    from time import time
    start = time()

    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        cubl.gemv(trans='N', m=m, n=n, alpha=fl_pr(1.0), A=cu_int_m,
                  x=cu_curr_phi, beta=fl_pr(0.0), y=cu_delta_phi)
        cubl.gemv(trans='N', m=m, n=n, alpha=fl_pr(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=fl_pr(1.0), y=cu_delta_phi)
        cubl.axpy(alpha=fl_pr(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps))

    return cu_curr_phi.copy_to_host(), []


class CUDASparseContext(object):
    def __init__(self, int_m, dec_m, device_id=0):

        if config['FP_precision'] == 32:
            self.fl_pr = np.float32
        elif config['FP_precision'] == 64:
            self.fl_pr = np.float64
        else:
            raise Exception(
                "CUDASparseContext(): Unknown precision specified.")
        #======================================================================
        # Setup GPU stuff and upload data to it
        #======================================================================
        try:
            from accelerate.cuda.blas import Blas
            import accelerate.cuda.sparse as cusparse
            from accelerate.cuda import cuda
        except ImportError:
            raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " +
                            "installed.\nCan not use GPU.")

        cuda.select_device(0)
        self.cuda = cuda
        self.cusp = cusparse.Sparse()
        self.cubl = Blas()
        self.set_matrices(int_m, dec_m)

    def set_matrices(self, int_m, dec_m):
        import accelerate.cuda.sparse as cusparse
        from accelerate.cuda import cuda

        self.m, self.n = int_m.shape
        self.int_m_nnz = int_m.nnz
        self.int_m_csrValA = cuda.to_device(int_m.data.astype(self.fl_pr))
        self.int_m_csrRowPtrA = cuda.to_device(int_m.indptr)
        self.int_m_csrColIndA = cuda.to_device(int_m.indices)

        self.dec_m_nnz = dec_m.nnz
        self.dec_m_csrValA = cuda.to_device(dec_m.data.astype(self.fl_pr))
        self.dec_m_csrRowPtrA = cuda.to_device(dec_m.indptr)
        self.dec_m_csrColIndA = cuda.to_device(dec_m.indices)

        self.descr = self.cusp.matdescr()
        self.descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO
        self.cu_delta_phi = self.cuda.device_array_like(
            np.zeros(self.m, dtype=self.fl_pr))
        print np.zeros(self.m, dtype=self.fl_pr).shape

    def set_phi(self, phi):
        self.cu_curr_phi = self.cuda.to_device(phi.astype(self.fl_pr))

    def get_phi(self):
        return self.cu_curr_phi.copy_to_host()

    def do_step(self, rho_inv, dX):

        self.cusp.csrmv(trans='N', m=self.m, n=self.n, nnz=self.int_m_nnz,
                        descr=self.descr,
                        alpha=self.fl_pr(1.0),
                        csrVal=self.int_m_csrValA,
                        csrRowPtr=self.int_m_csrRowPtrA,
                        csrColInd=self.int_m_csrColIndA,
                        x=self.cu_curr_phi, beta=self.fl_pr(0.0), y=self.cu_delta_phi)
        # print np.sum(cu_curr_phi.copy_to_host())
        self.cusp.csrmv(trans='N', m=self.m, n=self.n, nnz=self.dec_m_nnz,
                        descr=self.descr,
                        alpha=self.fl_pr(rho_inv),
                        csrVal=self.dec_m_csrValA,
                        csrRowPtr=self.dec_m_csrRowPtrA,
                        csrColInd=self.dec_m_csrColIndA,
                        x=self.cu_curr_phi, beta=self.fl_pr(1.0), y=self.cu_delta_phi)
        self.cubl.axpy(alpha=self.fl_pr(
            dX), x=self.cu_delta_phi, y=self.cu_curr_phi)

# class CUDASparseContextSkcuda(object):
#
#     *Doesn't work right now because of crashes and missing wrappers for cuSPARSE
#
#     def __init__(self, int_m, dec_m, device_id=0):

#         #=======================================================================
#         # Setup GPU stuff and upload data to it
#         #=======================================================================
#         # try:
#         import pycuda.autoinit
#         import pycuda.gpuarray as gpuarray
#         from skcuda import cusparse, cublas

#         # except ImportError:
#         #     raise Exception("kern_CUDA_sparse(): Numbapro CUDA libaries not " +
#         #                     "installed.\nCan not use GPU.")

#         # cuda.select_device(0)
#         self.cusp_handle = cusparse.cusparseCreate()
#         self.cubl_handle = cublas.cublasCreate()
#         self.gpua = gpuarray

#         if config['FP_precision'] == 32:
#             self.fl_pr = np.float32
#             self.mv = cusparse._libcusparse.cusparseScsrmv
#             self.axpy = cublas._libcublas.cublasSaxpy
#         elif config['FP_precision'] == 64:
#             self.fl_pr = np.float64
#             self.mv = cusparse._libcusparse.cusparseDcsrmv
#             self.axpy = cublas._libcublas.cublasDaxpy
#         else:
#             raise Exception("CUDASparseContext(): Unknown precision specified.")

#         self.set_matrices(int_m, dec_m)

#     def set_matrices(self, int_m, dec_m):
#         from skcuda import cusparse

#         self.m, self.n = int_m.shape
#         self.int_m_nnz = int_m.nnz
#         self.int_m_csrValA = self.gpua.to_gpu(int_m.data.astype(self.fl_pr))
#         self.int_m_csrRowPtrA = self.gpua.to_gpu(int_m.indptr)
#         self.int_m_csrColIndA = self.gpua.to_gpu(int_m.indices)

#         self.dec_m_nnz = dec_m.nnz
#         self.dec_m_csrValA = self.gpua.to_gpu(dec_m.data.astype(self.fl_pr))
#         self.dec_m_csrRowPtrA = self.gpua.to_gpu(dec_m.indptr)
#         self.dec_m_csrColIndA = self.gpua.to_gpu(dec_m.indices)

#         self.descr = cusparse.cusparseMatDescr()
#         self.descr.indexbase = cusparse.CUSPARSE_INDEX_BASE_ZERO
#         self.cu_delta_phi = self.gpua.empty(self.m,dtype=self.fl_pr)

#     def set_phi(self, phi):
#         self.cu_curr_phi = self.gpua.to_gpu(phi.astype(self.fl_pr))

#     def get_phi(self):
#         return self.cu_curr_phi.get()

#     def do_step(self, rho_inv, dX):
#         from skcuda import cusparse, cublas

#         cusparse._libcusparse.cusparseScsrmv(self.cusp_handle,
#                    trans='n', m=self.m, n=self.n, nnz=self.int_m_nnz,
#                    alpha=self.fl_pr(1.0), descrA=self.descr,
#                    csrVal=self.int_m_csrValA.gpudata,
#                    csrRowPtr=self.int_m_csrRowPtrA.gpudata,
#                    csrColInd=self.int_m_csrColIndA.gpudata,
#                    x=self.cu_curr_phi.gpudata, beta=self.fl_pr(0.0), y=self.cu_delta_phi.gpudata)
#         # # # print np.sum(cu_curr_phi.copy_to_host())
#         cusparse._libcusparse.cusparseScsrmv(self.cusp_handle, trans='N', m=self.m, n=self.n,
#                 nnz=self.dec_m_nnz,
#                 descr=self.descr,
#                 alpha=self.fl_pr(rho_inv),
#                 csrVal=self.dec_m_csrValA.gpudata,
#                 csrRowPtr=self.dec_m_csrRowPtrA.gpudata,
#                 csrColInd=self.dec_m_csrColIndA.gpudata,
#                 x=self.cu_curr_phi.gpudata, beta=self.fl_pr(1.0),
#                 y=self.cu_delta_phi.gpudata)
#         cublas.cublasSaxpy(self.cubl_handle, self.cu_curr_phi.size,
#                            self.fl_pr(dX),
# self.cu_delta_phi.gpudata, 1, self.cu_curr_phi.gpudata, 1)


[docs]def kern_CUDA_sparse(nsteps, dX, rho_inv, context, phi, grid_idcs,
                     mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None,
                     prog_bar=None):
    """`NVIDIA CUDA cuSPARSE <https://developer.nvidia.com/cusparse>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`accelerate` installation.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """

    c = context
    c.set_phi(phi)

    enmuloss = config['enable_muon_energy_loss']
    de = mu_egrid.size
    mu_egrid = mu_egrid.astype(c.fl_pr)
    muloss_min_step = config['muon_energy_loss_min_step']
    lidx, nmuspec = mu_lidx_nsp

    # Accumulate at least a few g/cm2 for energy loss steps
    # to avoid numerical errors
    dXaccum = 0.

    grid_step = 0
    grid_sol = []

    from time import time
    start = time()

    for step in xrange(nsteps):
        if prog_bar and (step % 5 == 0):
            prog_bar.update(step)

        c.do_step(rho_inv[step], dX[step])

        dXaccum += dX[step]

        if enmuloss and (dXaccum > muloss_min_step or step == nsteps - 1):
            # Download current solution vector to host
            phc = c.get_phi()
            for nsp in xrange(nmuspec):
                phc[lidx + de * nsp: lidx + de * (nsp + 1)] = np.interp(
                    mu_egrid, mu_egrid + mu_dEdX * dXaccum,
                    phc[lidx + de * nsp:lidx + de * (nsp + 1)])
            # Upload changed vector back..
            c.set_phi(phc)
            dXaccum = 0.

        if (grid_idcs and grid_step < len(grid_idcs)
                and grid_idcs[grid_step] == step):
            grid_sol.append(c.get_phi())
            grid_step += 1

    print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps))

    return c.get_phi(), grid_sol


[docs]def kern_MKL_sparse(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs,
                    mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None,
                    prog_bar=None):
    """`Intel MKL sparse BLAS
    <https://software.intel.com/en-us/articles/intel-mkl-sparse-blas-overview?language=en>`_
    implementation of forward-euler integration.

    Function requires that the path to the MKL runtime library ``libmkl_rt.[so/dylib]``
    defined in the config file.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      grid_idcs (list): indices at which longitudinal solutions have to be saved.
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """

    from ctypes import cdll, c_int, c_char, POINTER, byref

    try:
        mkl = cdll.LoadLibrary(config['MKL_path'])
    except OSError:
        raise Exception("kern_MKL_sparse(): MKL runtime library not " +
                        "found. Please check path.")

    gemv = None
    axpy = None
    np_fl = None
    if config['FP_precision'] == 32:
        from ctypes import c_float as fl_pr
        # sparse CSR-matrix x dense vector
        gemv = mkl.mkl_scsrmv
        # dense vector + dense vector
        axpy = mkl.cblas_saxpy
        np_fl = np.float32
    elif config['FP_precision'] == 64:
        from ctypes import c_double as fl_pr
        # sparse CSR-matrix x dense vector
        gemv = mkl.mkl_dcsrmv
        # dense vector + dense vector
        axpy = mkl.cblas_daxpy
        np_fl = np.float64
    else:
        raise Exception("kern_MKL_sparse(): Unknown precision specified.")

    # Set number of threads
    mkl.mkl_set_num_threads(byref(c_int(config['MKL_threads'])))

    # Prepare CTYPES pointers for MKL sparse CSR BLAS
    int_m_data = int_m.data.ctypes.data_as(POINTER(fl_pr))
    int_m_ci = int_m.indices.ctypes.data_as(POINTER(c_int))
    int_m_pb = int_m.indptr[:-1].ctypes.data_as(POINTER(c_int))
    int_m_pe = int_m.indptr[1:].ctypes.data_as(POINTER(c_int))

    dec_m_data = dec_m.data.ctypes.data_as(POINTER(fl_pr))
    dec_m_ci = dec_m.indices.ctypes.data_as(POINTER(c_int))
    dec_m_pb = dec_m.indptr[:-1].ctypes.data_as(POINTER(c_int))
    dec_m_pe = dec_m.indptr[1:].ctypes.data_as(POINTER(c_int))

    npphi = np.copy(phi).astype(np_fl)
    phi = npphi.ctypes.data_as(POINTER(fl_pr))
    npdelta_phi = np.zeros_like(npphi)
    delta_phi = npdelta_phi.ctypes.data_as(POINTER(fl_pr))

    trans = c_char('n')
    npmatd = np.chararray(6)
    npmatd[0] = 'G'
    npmatd[3] = 'C'
    matdsc = npmatd.ctypes.data_as(POINTER(c_char))
    m = c_int(int_m.shape[0])
    cdzero = fl_pr(0.)
    cdone = fl_pr(1.)
    cione = c_int(1)

    enmuloss = config['enable_muon_energy_loss']
    de = mu_egrid.size
    mu_egrid = mu_egrid.astype(np_fl)
    mu_dEdX = mu_dEdX.astype(np_fl)
    muloss_min_step = config['muon_energy_loss_min_step']
    lidx, nmuspec = mu_lidx_nsp
    # Accumulate at least a few g/cm2 for energy loss steps
    # to avoid numerical errors
    dXaccum = 0.

    grid_step = 0
    grid_sol = []

    from time import time
    start = time()

    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)

        # delta_phi = int_m.dot(phi)
        gemv(byref(trans), byref(m), byref(m),
             byref(cdone), matdsc,
             int_m_data, int_m_ci, int_m_pb, int_m_pe,
             phi, byref(cdzero), delta_phi)
        # delta_phi = rho_inv * dec_m.dot(phi) + delta_phi
        gemv(byref(trans), byref(m), byref(m),
             byref(fl_pr(rho_inv[step])), matdsc,
             dec_m_data, dec_m_ci, dec_m_pb, dec_m_pe,
             phi, byref(cdone), delta_phi)
        # phi = delta_phi * dX + phi
        axpy(m, fl_pr(dX[step]),
             delta_phi, cione, phi, cione)

        dXaccum += dX[step]

        if (enmuloss and
                (dXaccum > muloss_min_step or step == nsteps - 1)):
            for nsp in xrange(nmuspec):
                npphi[lidx + de * nsp: lidx + de * (nsp + 1)] = np.interp(
                    mu_egrid, mu_egrid + mu_dEdX * dXaccum,
                    npphi[lidx + de * nsp:lidx + de * (nsp + 1)])

            dXaccum = 0.

        if (grid_idcs and grid_step < len(grid_idcs)
                and grid_idcs[grid_step] == step):
            grid_sol.append(np.copy(npphi))
            grid_step += 1

    print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps))

    return npphi, grid_sol


[docs]def kern_XeonPHI_sparse(nsteps, dX, rho_inv, int_m, dec_m,
                        phi, grid_idcs,
                        mu_egrid=None, mu_dEdX=None, mu_lidx_nsp=None,
                        prog_bar=None):
    """Experimental Xeon Phi support using pyMIC library.
    """

    import sys
    import os
    sys.path.append(os.path.join(os.path.expanduser("~"), 'work/git/pymic'))
    import pymic as mic

    # load the library with the kernel function (on the target)

    device = mic.devices[1]
    base = os.path.dirname(os.path.abspath(__file__))
    library = device.load_library(os.path.join(base, "../Xeon_Phi/libmceq.so"))

    # use the default stream

    stream = device.get_default_stream()

    # Prepare CTYPES pointers for MKL sparse CSR BLAS
    mic_int_m_data = stream.bind(int_m.data)
    mic_int_m_ci = stream.bind(int_m.indices)
    mic_int_m_pb = stream.bind(int_m.indptr[:-1])
    mic_int_m_pe = stream.bind(int_m.indptr[1:])

    mic_dec_m_data = stream.bind(dec_m.data)
    mic_dec_m_ci = stream.bind(dec_m.indices)
    mic_dec_m_pb = stream.bind(dec_m.indptr[:-1])
    mic_dec_m_pe = stream.bind(dec_m.indptr[1:])

    npphi = np.copy(phi)
    mic_phi = stream.bind(npphi)
    npdelta_phi = np.zeros_like(npphi)
    mic_delta_phi = stream.bind(npdelta_phi)
    mic_dX = stream.bind(dX)
    mic_rho_inv = stream.bind(rho_inv)

    m = int_m.shape[0]

    from time import time
    start = time()

    stream.invoke(library.mceq_kernel,
                  m, nsteps,
                  mic_phi, mic_delta_phi,
                  mic_rho_inv, mic_dX,
                  mic_int_m_data, mic_int_m_ci,
                  mic_int_m_pb, mic_int_m_pe,
                  mic_dec_m_data, mic_dec_m_ci,
                  mic_dec_m_pb, mic_dec_m_pe)

    stream.sync()

    print "Performance: {0:6.2f}ms/iteration".format(1e3 * (time() - start) / float(nsteps))

    mic_phi.update_host()
    stream.sync()

    return mic_phi.array, []