mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-06-04 22:03:29 -06:00
371 lines
9.5 KiB
C++
371 lines
9.5 KiB
C++
/*
|
|
* -----------------------------------------------------------------
|
|
* Programmer(s): Slaven Peles, and Cody J. Balos @ LLNL
|
|
* -----------------------------------------------------------------
|
|
* SUNDIALS Copyright Start
|
|
* Copyright (c) 2002-2020, Lawrence Livermore National Security
|
|
* and Southern Methodist University.
|
|
* All rights reserved.
|
|
*
|
|
* See the top-level LICENSE and NOTICE files for details.
|
|
*
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
* SUNDIALS Copyright End
|
|
* -----------------------------------------------------------------
|
|
*/
|
|
|
|
|
|
/*
|
|
* Vector class
|
|
*
|
|
* Manages vector data layout for CUDA implementation of N_Vector.
|
|
*
|
|
*/
|
|
|
|
#ifndef _NVECTOR_CUDA_HPP_
|
|
#define _NVECTOR_CUDA_HPP_
|
|
|
|
#include <cstdlib>
|
|
#include <iostream>
|
|
|
|
#include <cuda_runtime.h>
|
|
#include "ThreadPartitioning.hpp"
|
|
|
|
#include <nvector/nvector_cuda.h>
|
|
#include <sundials/sundials_config.h>
|
|
|
|
namespace suncudavec
|
|
{
|
|
|
|
template <typename T, typename I>
|
|
class Vector : public _N_VectorContent_Cuda
|
|
{
|
|
|
|
public:
|
|
Vector(I N,
|
|
bool use_managed_memory = false, bool allocate_data = true,
|
|
T* const h_vec = nullptr, T* const d_vec = nullptr)
|
|
: size_(N),
|
|
mem_size_(N*sizeof(T)),
|
|
ownPartitioning_(true),
|
|
ownData_(allocate_data),
|
|
managed_mem_(use_managed_memory),
|
|
allocfn_(nullptr),
|
|
freefn_(nullptr),
|
|
h_vec_(h_vec),
|
|
d_vec_(d_vec)
|
|
{
|
|
// Set partitioning
|
|
partStream_ = new StreamPartitioning<T, I>(N, 256);
|
|
partReduce_ = new ReducePartitioning<T, I>(N, 256);
|
|
|
|
// Allocate data arrays
|
|
if (allocate_data)
|
|
allocate();
|
|
}
|
|
|
|
Vector(I N, cudaStream_t stream,
|
|
bool use_managed_memory = false, bool allocate_data = true,
|
|
T* const h_vec = nullptr, T* const d_vec = nullptr)
|
|
: size_(N),
|
|
mem_size_(N*sizeof(T)),
|
|
ownPartitioning_(true),
|
|
ownData_(allocate_data),
|
|
managed_mem_(use_managed_memory),
|
|
allocfn_(nullptr),
|
|
freefn_(nullptr),
|
|
h_vec_(h_vec),
|
|
d_vec_(d_vec)
|
|
{
|
|
// Set partitioning
|
|
partStream_ = new StreamPartitioning<T, I>(N, 256, stream);
|
|
partReduce_ = new ReducePartitioning<T, I>(N, 256, stream);
|
|
|
|
// Allocate data arrays
|
|
if (allocate_data)
|
|
allocate();
|
|
}
|
|
|
|
Vector(I N,
|
|
SUNAllocFn allocfn, SUNFreeFn freefn,
|
|
bool allocate_data = true)
|
|
: size_(N),
|
|
mem_size_(N*sizeof(T)),
|
|
ownPartitioning_(true),
|
|
ownData_(allocate_data),
|
|
managed_mem_(true),
|
|
allocfn_(allocfn),
|
|
freefn_(freefn),
|
|
h_vec_(nullptr),
|
|
d_vec_(nullptr)
|
|
{
|
|
// Set partitioning
|
|
partStream_ = new StreamPartitioning<T, I>(N, 256);
|
|
partReduce_ = new ReducePartitioning<T, I>(N, 256, allocfn, freefn);
|
|
|
|
// Allocate data arrays
|
|
if (allocate_data)
|
|
allocate();
|
|
}
|
|
|
|
Vector(I N, cudaStream_t stream,
|
|
SUNAllocFn allocfn, SUNFreeFn freefn,
|
|
bool allocate_data = true)
|
|
: size_(N),
|
|
mem_size_(N*sizeof(T)),
|
|
ownPartitioning_(true),
|
|
ownData_(allocate_data),
|
|
managed_mem_(true),
|
|
allocfn_(allocfn),
|
|
freefn_(freefn),
|
|
h_vec_(nullptr),
|
|
d_vec_(nullptr)
|
|
{
|
|
// Set partitioning
|
|
partStream_ = new StreamPartitioning<T, I>(N, 256, stream);
|
|
partReduce_ = new ReducePartitioning<T, I>(N, 256, stream, allocfn, freefn);
|
|
|
|
// Allocate data arrays
|
|
if (allocate_data)
|
|
allocate();
|
|
}
|
|
|
|
// Copy constructor does not copy data array values
|
|
explicit Vector(const Vector& v)
|
|
: size_(v.size()),
|
|
mem_size_(size_*sizeof(T)),
|
|
partStream_(v.partStream_),
|
|
partReduce_(v.partReduce_),
|
|
ownPartitioning_(false),
|
|
ownData_(true),
|
|
managed_mem_(v.managed_mem_),
|
|
allocfn_(v.allocfn_),
|
|
freefn_(v.freefn_),
|
|
h_vec_(nullptr),
|
|
d_vec_(nullptr)
|
|
{
|
|
allocate();
|
|
}
|
|
|
|
~Vector()
|
|
{
|
|
cudaError_t err;
|
|
|
|
if (ownPartitioning_) {
|
|
delete partReduce_;
|
|
delete partStream_;
|
|
}
|
|
|
|
if (ownData_) {
|
|
if (freefn_) {
|
|
freefn_(d_vec_);
|
|
d_vec_ = nullptr;
|
|
h_vec_ = nullptr;
|
|
} else {
|
|
if (!managed_mem_)
|
|
free(h_vec_);
|
|
err = cudaFree(d_vec_);
|
|
if(err != cudaSuccess)
|
|
std::cerr << "Failed to free device vector "
|
|
<< "in suncudavec::Vector::~Vector "
|
|
<< "(error code " << err << ")\n";
|
|
d_vec_ = nullptr;
|
|
h_vec_ = nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
void allocate()
|
|
{
|
|
if (allocfn_) {
|
|
allocateCustom();
|
|
} else if (managed_mem_) {
|
|
allocateManaged();
|
|
} else {
|
|
allocateUnmanaged();
|
|
}
|
|
}
|
|
|
|
void allocateManaged()
|
|
{
|
|
cudaError_t err;
|
|
err = cudaMallocManaged((void**) &d_vec_, mem_size_);
|
|
if (err != cudaSuccess)
|
|
std::cerr << "Failed to allocate managed vector "
|
|
<< "in suncudavec::Vector::allocateManaged "
|
|
<< "(error code " << err << ")\n";
|
|
h_vec_ = d_vec_;
|
|
}
|
|
|
|
void allocateUnmanaged()
|
|
{
|
|
cudaError_t err;
|
|
|
|
h_vec_ = static_cast<T*>(malloc(mem_size_));
|
|
if(h_vec_ == nullptr)
|
|
std::cerr << "Failed to allocate host vector "
|
|
<< "in suncudavec::Vector::allocateUnmanaged\n";
|
|
|
|
err = cudaMalloc((void**) &d_vec_, mem_size_);
|
|
if(err != cudaSuccess)
|
|
std::cerr << "Failed to allocate device vector "
|
|
<< "in suncudavec::Vector::allocateUnmanaged "
|
|
<< "(error code " << err << ")\n";
|
|
}
|
|
|
|
void allocateCustom()
|
|
{
|
|
/* We assume managed memory when a custom allocator is provided */
|
|
d_vec_ = (realtype *) allocfn_(mem_size_);
|
|
if (d_vec_ == nullptr)
|
|
std::cerr << "Failed to allocate vector with user-provied allocator "
|
|
<< "in suncudavec::Vector::allocateCustom()\n";
|
|
h_vec_ = d_vec_;
|
|
}
|
|
|
|
int size() const
|
|
{
|
|
return size_;
|
|
}
|
|
|
|
T* host()
|
|
{
|
|
// If the vector is using managed memory, and a user
|
|
// is accessing a data array, then we need to synchronzie
|
|
// to ensure all kernels have completed since a memcpy
|
|
// won't have to happen.
|
|
if (managed_mem_)
|
|
cudaStreamSynchronize(partReduce_->stream());
|
|
return h_vec_;
|
|
}
|
|
|
|
const T* host() const
|
|
{
|
|
// If the vector is using managed memory, and a user
|
|
// is accessing a data array, then we need to synchronzie
|
|
// to ensure all kernels have completed since a memcpy
|
|
// won't have to happen.
|
|
if (managed_mem_)
|
|
cudaStreamSynchronize(partReduce_->stream());
|
|
return h_vec_;
|
|
}
|
|
|
|
T* device()
|
|
{
|
|
// If the vector is using managed memory, and a user
|
|
// is accessing a data array, then we need to synchronzie
|
|
// to ensure all kernels have completed since a memcpy
|
|
// won't have to happen.
|
|
if (managed_mem_)
|
|
cudaStreamSynchronize(partReduce_->stream());
|
|
return d_vec_;
|
|
}
|
|
|
|
const T* device() const
|
|
{
|
|
// If the vector is using managed memory, and a user
|
|
// is accessing a data array, then we need to synchronzie
|
|
// to ensure all kernels have completed since a memcpy
|
|
// won't have to happen.
|
|
if (managed_mem_)
|
|
cudaStreamSynchronize(partReduce_->stream());
|
|
return d_vec_;
|
|
}
|
|
|
|
bool isManaged() const
|
|
{
|
|
return managed_mem_;
|
|
}
|
|
|
|
void copyToDev()
|
|
{
|
|
cudaError_t err;
|
|
|
|
/* If the host and device pointers are the same, then we don't need
|
|
to do a copy (this happens in the managed memory case), but we
|
|
still need to synchronize the device to adhere to the unified
|
|
memory access rules. */
|
|
if (h_vec_ == d_vec_) {
|
|
err = cudaStreamSynchronize(partReduce_->stream());
|
|
if(err != cudaSuccess)
|
|
std::cerr << "Failed to synchronize stream in "
|
|
<< "suncudavec::Vector::copyToDev "
|
|
<< "(error code " << err << ")\n";
|
|
} else {
|
|
err = cudaMemcpyAsync(d_vec_, h_vec_, mem_size_, cudaMemcpyHostToDevice,
|
|
partReduce_->stream());
|
|
if(err != cudaSuccess)
|
|
std::cerr << "Failed to copy vector from host to device in "
|
|
<< "suncudavec::Vector::copyToDev "
|
|
<< "(error code " << err << ")\n";
|
|
}
|
|
}
|
|
|
|
void copyFromDev()
|
|
{
|
|
cudaError_t err;
|
|
|
|
/* If the host and device pointers are the same, then we don't need
|
|
to do a copy (this happens in the managed memory case), but we
|
|
still need to synchronize the device to adhere to the unified
|
|
memory access rules. */
|
|
if (h_vec_ == d_vec_) {
|
|
err = cudaStreamSynchronize(partReduce_->stream());
|
|
if(err != cudaSuccess)
|
|
std::cerr << "Failed to synchronize stream in "
|
|
<< "suncudavec::Vector::copyFromDev "
|
|
<< "(error code " << err << ")\n";
|
|
} else {
|
|
err = cudaMemcpyAsync(h_vec_, d_vec_, mem_size_, cudaMemcpyDeviceToHost,
|
|
partReduce_->stream());
|
|
if(err != cudaSuccess)
|
|
std::cerr << "Failed to copy vector from device to host in "
|
|
<< "suncudavec::Vector::copyFromDev "
|
|
<< "(error code " << err << ")\n";
|
|
}
|
|
}
|
|
|
|
void setPartitioning(ThreadPartitioning<T, I>* stream, ThreadPartitioning<T, I>* reduce)
|
|
{
|
|
if (ownPartitioning_) {
|
|
delete partStream_;
|
|
delete partReduce_;
|
|
}
|
|
partStream_ = stream;
|
|
partReduce_ = reduce;
|
|
ownPartitioning_ = false;
|
|
}
|
|
|
|
ThreadPartitioning<T, I>& partStream() const
|
|
{
|
|
return *partStream_;
|
|
}
|
|
|
|
ThreadPartitioning<T, I>& partReduce() const
|
|
{
|
|
return *partReduce_;
|
|
}
|
|
|
|
|
|
private:
|
|
I size_;
|
|
I mem_size_;
|
|
T* h_vec_;
|
|
T* d_vec_;
|
|
ThreadPartitioning<T, I>* partStream_;
|
|
ThreadPartitioning<T, I>* partReduce_;
|
|
bool ownPartitioning_;
|
|
bool ownData_;
|
|
bool managed_mem_;
|
|
SUNAllocFn allocfn_;
|
|
SUNFreeFn freefn_;
|
|
|
|
};
|
|
|
|
|
|
} // namespace suncudavec
|
|
|
|
|
|
|
|
|
|
#endif // _NVECTOR_CUDA_HPP_
|