ultimatepp/bazaar/plugin/sundials/include/nvector/cuda/ThreadPartitioning.hpp
koldo 59635c7080 sundials: Added package for non linear equations and differential algebraic equations solving
git-svn-id: svn://ultimatepp.org/upp/trunk@14564 f0d560ea-af0d-0410-9eb7-867de7ffcac7
2020-06-07 10:23:55 +00:00

386 lines
9.9 KiB
C++

/*
* -----------------------------------------------------------------
* Programmer(s): Slaven Peles @ LLNL
* -----------------------------------------------------------------
* SUNDIALS Copyright Start
* Copyright (c) 2002-2020, Lawrence Livermore National Security
* and Southern Methodist University.
* All rights reserved.
*
* See the top-level LICENSE and NOTICE files for details.
*
* SPDX-License-Identifier: BSD-3-Clause
* SUNDIALS Copyright End
* -----------------------------------------------------------------
*/
#ifndef _THREAD_PARTITIONING_HPP_
#define _THREAD_PARTITIONING_HPP_
#include <iostream>
#include <cuda_runtime.h>
#include <sundials/sundials_types.h>
namespace suncudavec
{
/* Pointer to a user-supplied allocation routine: takes a size (size_t) and
   returns an untyped pointer. ReducePartitioning calls it with the buffer
   size in bytes and uses the result as both its device and host buffer. */
using SUNAllocFn = void* (*)(size_t);
/* Pointer to a user-supplied deallocation routine: takes the pointer to
   release and returns nothing. ReducePartitioning calls it from its
   destructor when its device and host buffer pointers are identical. */
using SUNFreeFn = void (*)(void*);
/*
 * Abstract base class holding a kernel launch configuration (grid size,
 * block size, shared-memory size in bytes, CUDA stream) together with the
 * bookkeeping for an optional pair of device/host scratch buffers.
 *
 * Template parameters:
 *   T - element type stored in the scratch buffers
 *   I - index type used for the problem size N
 *
 * The base class never allocates or frees the buffers itself; derived
 * classes decide whether buffers exist and who owns them (ownBuffer_).
 */
template<class T, class I>
class ThreadPartitioning
{
public:
  /* Default configuration: one block of one thread, default stream,
     no shared memory, no buffers, no custom allocator. */
  ThreadPartitioning()
  : block_(1),
    grid_(1),
    shMemSize_(0),
    bufferSize_(0),
    stream_(0),
    d_buffer_(nullptr),
    h_buffer_(nullptr),
    ownBuffer_(true),
    allocfn_(nullptr),
    freefn_(nullptr)
  {}

  /* Configuration with a caller-chosen block size and optional custom
     allocation/deallocation routines. The grid size starts at 1 and is
     expected to be set by the derived class. */
  ThreadPartitioning(unsigned block,
                     SUNAllocFn allocfn = nullptr,
                     SUNFreeFn freefn = nullptr)
  : block_(block),
    grid_(1),
    shMemSize_(0),
    bufferSize_(0),
    stream_(0),
    d_buffer_(nullptr),
    h_buffer_(nullptr),
    ownBuffer_(true),
    allocfn_(allocfn),
    freefn_(freefn)
  {}

  /* Copies the launch configuration and the allocator hooks of p.
     The scratch buffers of p are deliberately NOT shared: the copy starts
     with no buffers (null pointers, size 0) and with ownBuffer_ == true,
     exactly like a freshly constructed object. Every member is initialized
     so that derived destructors and the accessors never read an
     indeterminate value. */
  explicit ThreadPartitioning(ThreadPartitioning<T, I>& p)
  : block_(p.block_),
    grid_(p.grid_),
    shMemSize_(p.shMemSize_),
    bufferSize_(0),
    stream_(p.stream_),
    d_buffer_(nullptr),
    h_buffer_(nullptr),
    ownBuffer_(true),
    allocfn_(p.allocfn_),
    freefn_(p.freefn_)
  {}

  virtual ~ThreadPartitioning(){}

  /* Number of blocks in the grid. */
  unsigned grid() const
  {
    return grid_;
  }

  /* Number of threads per block. */
  unsigned block() const
  {
    return block_;
  }

  /* Shared-memory size in bytes. */
  unsigned shmem() const
  {
    return shMemSize_;
  }

  /* CUDA stream associated with this partitioning. */
  cudaStream_t stream() const
  {
    return stream_;
  }

  /* Size in bytes recorded for the internal buffers (0 when none were
     allocated by this object). */
  unsigned int bufferSize() const
  {
    return bufferSize_;
  }

  /* Device-side scratch buffer (may be nullptr). */
  T* devBuffer()
  {
    return d_buffer_;
  }

  const T* devBuffer() const
  {
    return d_buffer_;
  }

  /* Host-side scratch buffer (may be nullptr). */
  T* hostBuffer()
  {
    return h_buffer_;
  }

  const T* hostBuffer() const
  {
    return h_buffer_;
  }

  /* Replaces the CUDA stream associated with this partitioning. */
  void setStream(const cudaStream_t& stream)
  {
    stream_ = stream;
  }

  /* Base-class placeholder: it has no buffers to copy, so it only reports
     the misuse on stderr. Derived classes that own buffers override it.
     n is the number of elements the caller wants copied. */
  virtual void copyFromDevBuffer(unsigned int n) const
  {
    (void) n; /* unused in the base class */
    std::cerr << "Trying to copy buffer from base class in "
              << "suncudavec::ThreadPartitioning::copyFromDevBuffer\n";
  }

  /* pure virtual functions to get the relevant partitioning information */
  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize, cudaStream_t& stream) = 0;
  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize) = 0;

protected:
  unsigned block_;      /* threads per block */
  unsigned grid_;       /* blocks per grid */
  unsigned shMemSize_;  /* shared-memory size in bytes */
  unsigned bufferSize_; /* size in bytes of each internal buffer */
  cudaStream_t stream_; /* stream associated with this partitioning */
  T* d_buffer_;         /* device scratch buffer */
  T* h_buffer_;         /* host scratch buffer */
  bool ownBuffer_;      /* true when this object is responsible for freeing the buffers */

  /* custom allocators for the internal buffers */
  SUNAllocFn allocfn_;
  SUNFreeFn freefn_;
}; // class ThreadPartitioning
/*
 * Partitioning that assigns one thread per element: the grid size is the
 * problem size N divided by the block size, rounded up. It requests no
 * shared memory and keeps no scratch buffers.
 */
template<class T, class I>
class StreamPartitioning : public ThreadPartitioning<T, I>
{
  using ThreadPartitioning<T, I>::block_;
  using ThreadPartitioning<T, I>::grid_;
  using ThreadPartitioning<T, I>::stream_;

  /* Number of blocks of `threads` threads needed to cover `count` elements
     (integer division rounded up). */
  static unsigned blocksFor(I count, unsigned threads)
  {
    return (count + threads - 1) / threads;
  }

public:
  /* Partition N elements into blocks of `block` threads on `stream`. */
  StreamPartitioning(I N, unsigned block, cudaStream_t stream)
  : ThreadPartitioning<T, I>(block)
  {
    grid_ = blocksFor(N, block_);
    stream_ = stream;
  }

  /* Partition N elements into blocks of `block` threads; the stream keeps
     the value set by the base-class constructor. */
  StreamPartitioning(I N, unsigned block)
  : ThreadPartitioning<T, I>(block)
  {
    grid_ = blocksFor(N, block_);
  }

  /* Copy: everything is delegated to the base-class copy constructor. */
  explicit StreamPartitioning(StreamPartitioning<T, I>& p)
  : ThreadPartitioning<T, I>(p)
  {}

  /* Reports the launch configuration for N elements, including the stream.
     The stored block size is reused; shared memory is always 0.
     Always returns 0. */
  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize,
                               cudaStream_t& stream) override
  {
    block = block_;
    grid = blocksFor(N, block_);
    shMemSize = 0;
    stream = stream_;
    return 0;
  }

  /* Same as above, without reporting the stream. Always returns 0. */
  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize) override
  {
    block = block_;
    grid = blocksFor(N, block_);
    shMemSize = 0;
    return 0;
  }
}; // class StreamPartitioning
/*
 * Partitioning for reduction kernels.
 *
 * The grid size is N / (2 * block) rounded up (presumably because each
 * thread consumes two input elements -- confirm against the kernels), the
 * shared-memory request is block * sizeof(T) bytes, and the object carries
 * a scratch buffer with one T per block on the device plus a host-side
 * counterpart.
 *
 * Buffer ownership:
 *  - buffers allocated here (own allocation, managed memory, or the custom
 *    allocator) are released in the destructor;
 *  - buffers handed in by the caller are never released by this class.
 * When the device and host pointers are identical the buffer is treated as
 * memory reachable from both sides (managed / custom allocated).
 *
 * NOTE(review): no copy-assignment operator is declared, so the implicitly
 * generated one copies the raw buffer pointers; assigning one instance to
 * another would make both destructors free the same memory. Avoid
 * assignment of these objects.
 */
template<class T, class I=int>
class ReducePartitioning : public ThreadPartitioning<T, I>
{
  using ThreadPartitioning<T, I>::block_;
  using ThreadPartitioning<T, I>::grid_;
  using ThreadPartitioning<T, I>::shMemSize_;
  using ThreadPartitioning<T, I>::stream_;
  using ThreadPartitioning<T, I>::bufferSize_;
  using ThreadPartitioning<T, I>::d_buffer_;
  using ThreadPartitioning<T, I>::h_buffer_;
  using ThreadPartitioning<T, I>::ownBuffer_;
  using ThreadPartitioning<T, I>::allocfn_;
  using ThreadPartitioning<T, I>::freefn_;

public:
  /* Partition N elements with `block` threads per block and allocate the
     internal buffers. When allocfn is non-null it is used for a single
     buffer shared by host and device; otherwise separate host (malloc) and
     device (cudaMalloc) buffers are allocated. */
  ReducePartitioning(I N, unsigned block,
                     SUNAllocFn allocfn = nullptr, SUNFreeFn freefn = nullptr)
  : ThreadPartitioning<T, I>(block, allocfn, freefn)
  {
    grid_ = (N + (block_ * 2 - 1)) / (block_ * 2);
    shMemSize_ = block_*sizeof(T);
    allocateBuffer(false, allocfn != nullptr);
  }

  /* Same as above, additionally binding the partitioning to `stream`. */
  ReducePartitioning(I N, unsigned block, cudaStream_t stream,
                     SUNAllocFn allocfn = nullptr, SUNFreeFn freefn = nullptr)
  : ThreadPartitioning<T, I>(block, allocfn, freefn)
  {
    grid_ = (N + (block_ * 2 - 1)) / (block_ * 2);
    shMemSize_ = block_*sizeof(T);
    stream_ = stream;
    allocateBuffer(false, allocfn != nullptr);
  }

  /* Uses caller-provided host and device buffers. Nothing is allocated and
     nothing is freed by this object (ownBuffer_ is false); the caller must
     keep the buffers alive and large enough for one T per block. */
  ReducePartitioning(T *h_buffer, T *d_buffer, I N, unsigned block, cudaStream_t stream = 0)
  : ThreadPartitioning<T, I>(block)
  {
    grid_ = (N + (block_ * 2 - 1)) / (block_ * 2);
    shMemSize_ = block_*sizeof(T);
    stream_ = stream;
    h_buffer_ = h_buffer;
    d_buffer_ = d_buffer;
    ownBuffer_ = false;
  }

  /* Copies the launch configuration of p and allocates a fresh set of
     buffers owned by the new object (the buffers of p are not shared). */
  explicit ReducePartitioning(ReducePartitioning<T, I>& p)
  : ThreadPartitioning<T, I>(p)
  {
    shMemSize_ = p.shMemSize_;
    /* Establish the buffer bookkeeping here rather than relying on the
       base class: the destructor reads ownBuffer_, bufferSize_ and both
       pointers, so none of them may be left indeterminate. */
    d_buffer_ = nullptr;
    h_buffer_ = nullptr;
    bufferSize_ = 0;
    ownBuffer_ = true;
    /* if device buffer and host buffer are the same, then assume managed memory */
    allocateBuffer(p.d_buffer_ == p.h_buffer_, p.allocfn_ != nullptr);
  }

  /* Releases the buffers if (and only if) this object allocated them. */
  ~ReducePartitioning()
  {
    if (!ownBuffer_ || bufferSize_ == 0) return;

    cudaError_t err;
    if (d_buffer_ == h_buffer_) {
      /* managed memory; both pointers are null if the allocation failed,
         in which case there is nothing to release */
      if (d_buffer_ != nullptr) {
        if (freefn_) {
          freefn_(d_buffer_);
        } else {
          err = cudaFree(d_buffer_);
          if(err != cudaSuccess)
            std::cerr << "Failed to free device vector "
                      << "in suncudavec::ReducePartitioning::~ReducePartitioning "
                      << "(CUDA error code " << err << ")\n";
        }
      }
    } else {
      /* unmanaged memory */
      err = cudaFree(d_buffer_);
      if(err != cudaSuccess)
        std::cerr << "Failed to free device vector "
                  << "in suncudavec::ReducePartitioning::~ReducePartitioning "
                  << "(CUDA error code " << err << ")\n";
      free(h_buffer_);
    }
    d_buffer_ = nullptr;
    h_buffer_ = nullptr;
  }

  /* Reports the launch configuration for N elements, including the stream.
     The stored block size is reused. Always returns 0. */
  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize,
                               cudaStream_t& stream) override
  {
    block = block_;
    grid = (N + (block_ * 2 - 1)) / (block_ * 2);
    shMemSize = block_ * sizeof(T);
    stream = stream_;
    return 0;
  }

  /* Same as above, without reporting the stream. Always returns 0. */
  virtual int calcPartitioning(I N, unsigned& grid, unsigned& block, unsigned& shMemSize) override
  {
    block = block_;
    grid = (N + (block_ * 2 - 1)) / (block_ * 2);
    shMemSize = block_ * sizeof(T);
    return 0;
  }

  /* Makes the first n elements of the device buffer visible in the host
     buffer. On return the host buffer can be read safely: the stream has
     been synchronized in both the shared-pointer and the copy case.
     Errors are reported on stderr. */
  virtual void copyFromDevBuffer(unsigned int n) const override
  {
    cudaError_t err;
    /* If the host and device pointers are the same, then we don't need
       to do a copy (this happens in the managed memory case), but we
       still need to synchronize the device to adhere to the unified
       memory access rules. */
    if (h_buffer_ == d_buffer_) {
      err = cudaStreamSynchronize(stream_);
      if(err != cudaSuccess)
        std::cerr << "Failed to synchronize stream in "
                  << "suncudavec::ReducePartitioning::copyFromDevBuffer "
                  << "(CUDA error code " << err << ")\n";
    } else {
      err = cudaMemcpyAsync(h_buffer_, d_buffer_, n*sizeof(T), cudaMemcpyDeviceToHost,
                            stream_);
      if(err != cudaSuccess) {
        std::cerr << "Failed to copy vector from device to host in "
                  << "suncudavec::ReducePartitioning::copyFromDevBuffer "
                  << "(CUDA error code " << err << ")\n";
        return;
      }
      /* The host buffer may be caller-provided page-locked memory, in which
         case the asynchronous copy can still be in flight when the call
         returns. Wait for this stream so the host sees the copied data. */
      err = cudaStreamSynchronize(stream_);
      if(err != cudaSuccess)
        std::cerr << "Failed to synchronize stream in "
                  << "suncudavec::ReducePartitioning::copyFromDevBuffer "
                  << "(CUDA error code " << err << ")\n";
    }
  }

  /* Size in bytes of the buffer needed for N elements reduced with `block`
     threads per block: one T per block of the resulting grid. */
  static unsigned calcBufferSize(I N, unsigned block)
  {
    return (N + (block * 2 - 1)) / (block * 2) * sizeof(T);
  }

private:
  /* Allocates the internal buffers, one T per block of the current grid.
       custom_allocator   - use allocfn_ for one buffer shared by host and device
       use_managed_memory - otherwise use cudaMallocManaged for one shared buffer
       neither            - malloc a host buffer and cudaMalloc a device buffer
     Returns 0 on success (or when the grid is empty and nothing is needed)
     and -1 if any allocation failed; a pointer whose allocation failed is
     left null. Failures are also reported on stderr. */
  int allocateBuffer(bool use_managed_memory = false, bool custom_allocator = false)
  {
    cudaError_t err;
    int status = 0;

    bufferSize_ = grid_ * sizeof(T);
    if (bufferSize_ == 0) return 0;

    if (custom_allocator) {
      d_buffer_ = static_cast<T*>(allocfn_(bufferSize_));
      if(d_buffer_ == nullptr) {
        std::cerr << "Failed to allocate managed buffer with custom allocator in "
                  << "suncudavec::ReducePartitioning::allocateBuffer\n";
        status = -1;
      }
      h_buffer_ = d_buffer_;
    } else if (use_managed_memory) {
      err = cudaMallocManaged((void**) &d_buffer_, bufferSize_);
      if(err != cudaSuccess) {
        std::cerr << "Failed to allocate internal managed buffer in "
                  << "suncudavec::ReducePartitioning::allocateBuffer "
                  << "(CUDA error code " << err << ")\n";
        d_buffer_ = nullptr;
        status = -1;
      }
      h_buffer_ = d_buffer_;
    } else {
      h_buffer_ = static_cast<T*>(malloc(bufferSize_));
      if(h_buffer_ == nullptr) {
        std::cerr << "Failed to allocate internal host buffer in "
                  << "suncudavec::ReducePartitioning::allocateBuffer\n";
        status = -1;
      }
      err = cudaMalloc((void**) &d_buffer_, bufferSize_);
      if(err != cudaSuccess) {
        std::cerr << "Failed to allocate internal device buffer "
                  << "in suncudavec::ReducePartitioning::allocateBuffer "
                  << "(CUDA error code " << err << ")\n";
        d_buffer_ = nullptr;
        status = -1;
      }
    }
    return status;
  }
}; // class ReducePartitioning
} // namespace suncudavec
#endif // _THREAD_PARTITIONING_HPP_