-
Notifications
You must be signed in to change notification settings - Fork 225
fix TaskSetValue bug #174
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix TaskSetValue bug #174
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,5 @@ | ||
| /** | ||
| * Copyright 2013 Felix Schmitt, Heiko Burau, Rene Widera | ||
| * Copyright 2013-2014 Felix Schmitt, Heiko Burau, Rene Widera | ||
| * | ||
| * This file is part of libPMacc. | ||
| * | ||
|
|
@@ -17,11 +17,9 @@ | |
| * You should have received a copy of the GNU General Public License | ||
| * and the GNU Lesser General Public License along with libPMacc. | ||
| * If not, see <http://www.gnu.org/licenses/>. | ||
| */ | ||
|
|
||
| */ | ||
|
|
||
| #ifndef _TASKSETVALUE_HPP | ||
| #define _TASKSETVALUE_HPP | ||
| #pragma once | ||
|
|
||
| #include <cuda_runtime_api.h> | ||
| #include <cuda.h> | ||
|
|
@@ -35,126 +33,104 @@ | |
| #include "eventSystem/tasks/StreamTask.hpp" | ||
| #include "mappings/simulation/EnvironmentController.hpp" | ||
|
|
||
| #include <boost/type_traits/remove_pointer.hpp> | ||
| #include <boost/type_traits.hpp> | ||
|
|
||
| namespace PMacc | ||
| { | ||
|
|
||
| template <class DataBox> | ||
| __global__ void kernelSetValue(DataBox data ,const DataSpace<DIM3> size) | ||
| namespace taskSetValueHelper | ||
| { | ||
| DataSpace<DIM3> idx; | ||
|
|
||
| idx[0] = blockDim.x * (blockIdx.x/size.z()) + threadIdx.x; | ||
| idx[1] = blockDim.y * blockIdx.y; | ||
| idx[2] = blockIdx.x%size.z(); | ||
|
|
||
| if (idx.x() >= size.x()) | ||
| return; | ||
| data[idx.z()][idx.y()][idx.x()] = *data; | ||
| } | ||
|
|
||
| template <class DataBox> | ||
| __global__ void kernelSetValue(DataBox data, const DataSpace<DIM2> size) | ||
| /** define access operation for non-pointer types | ||
| */ | ||
| template<typename T_Type, bool isPointer> | ||
| struct Value | ||
| { | ||
| DataSpace<DIM2> idx; | ||
|
|
||
| idx[0] = blockDim.x * blockIdx.x + threadIdx.x; | ||
| idx[1] = blockDim.y * blockIdx.y + threadIdx.y; | ||
| typedef const T_Type type; | ||
|
|
||
| if (idx.x() >= size.x()) | ||
| return; | ||
|
|
||
| data[idx.y()][idx.x()] = *data; | ||
| } | ||
| HDINLINE type& operator()(type& v) const | ||
| { | ||
| return v; | ||
| } | ||
| }; | ||
|
|
||
| template <class DataBox> | ||
| __global__ void kernelSetValue(DataBox data, const DataSpace<DIM1> size) | ||
| /** define access operation for pointer types | ||
| * | ||
| * access first element of a pointer | ||
| */ | ||
| template<typename T_Type> | ||
| struct Value<T_Type, true> | ||
| { | ||
| size_t idx; | ||
|
|
||
| idx = blockDim.x * blockIdx.x + threadIdx.x; | ||
|
|
||
| if (idx >= size.x()) | ||
| return; | ||
| typedef const T_Type PtrType; | ||
| typedef const typename boost::remove_pointer<PtrType>::type type; | ||
|
|
||
| data[idx] = *data; | ||
| } | ||
| HDINLINE type& operator()(PtrType v) const | ||
| { | ||
| return *v; | ||
| } | ||
| }; | ||
|
|
||
| template <class DataBox,typename TYPE> | ||
| __global__ void kernelSetValue(DataBox data, const TYPE value, const DataSpace<DIM3> size) | ||
| /** Get access to a value from a pointer or reference with the same method | ||
| */ | ||
| template<typename T_Type> | ||
| HDINLINE typename Value<T_Type, boost::is_pointer<T_Type>::value >::type& | ||
| getValue(T_Type value) | ||
| { | ||
| DataSpace<DIM3> idx; | ||
|
|
||
| idx[0] = blockDim.x * (blockIdx.x/size.z()) + threadIdx.x; | ||
| idx[1] = blockDim.y * blockIdx.y; | ||
| idx[2] = blockIdx.x%size.z(); | ||
|
|
||
| if (idx.x() >= size.x()) | ||
| return; | ||
| data[idx.z()][idx.y()][idx.x()] = value; | ||
| typedef Value<T_Type, boost::is_pointer<T_Type>::value > Functor; | ||
| return Functor()(value); | ||
| } | ||
|
|
||
| template <class DataBox,typename TYPE> | ||
| __global__ void kernelSetValue(DataBox data, const TYPE value, const DataSpace<DIM2> size) | ||
| { | ||
| DataSpace<DIM2> idx; | ||
|
|
||
| idx[0] = blockDim.x * blockIdx.x + threadIdx.x; | ||
| idx[1] = blockDim.y * blockIdx.y + threadIdx.y; | ||
|
|
||
| if (idx.x() >= size.x()) | ||
| return; | ||
|
|
||
| data[idx.y()][idx.x()] = value; | ||
| } | ||
|
|
||
| template <class DataBox,typename TYPE> | ||
| __global__ void kernelSetValue(DataBox data, const TYPE value, const DataSpace<DIM1> size) | ||
| template <class DataBox, typename T_ValueType, typename Space> | ||
| __global__ void kernelSetValue(DataBox data, const T_ValueType value, const Space size) | ||
| { | ||
| int idx; | ||
| const Space threadIndex(threadIdx); | ||
| const Space blockIndex(blockIdx); | ||
| const Space gridSize(blockDim); | ||
|
|
||
| idx = blockDim.x * blockIdx.x + threadIdx.x; | ||
| Space idx(gridSize * blockIndex + threadIndex); | ||
|
|
||
| if (idx >= size.x()) | ||
| if (idx.x() >= size.x()) | ||
| return; | ||
|
|
||
| data[idx] = value; | ||
| data(idx) = taskSetValueHelper::getValue(value); | ||
| } | ||
|
|
||
|
|
||
| template <class TYPE, unsigned DIM> | ||
| class DeviceBuffer; | ||
|
|
||
| /*Set a value for a GridBuffer on the defice | ||
| * TYPE = data type (e.g. float, float2) | ||
| * DIM = dimension of the GridBuffer | ||
| * SMALL = true if TYPE can send via kernel parameter (on cuda TYPE must be smaller than 256 byte) | ||
| /** Set all cells of a GridBuffer on the device to a given value | ||
| * | ||
| * T_ValueType = data type (e.g. float, float2) | ||
| * T_dim = dimension of the GridBuffer | ||
| * T_isSmallValue = true if T_ValueType can be send via kernel parameter (on cuda T_ValueType must be smaller than 256 byte) | ||
| */ | ||
| template <class TYPE, unsigned DIM,bool SMALL> | ||
| template <class T_ValueType, unsigned T_dim, bool T_isSmallValue> | ||
| class TaskSetValue; | ||
|
|
||
| template <class TYPE, unsigned DIM> | ||
| class TaskSetValue<TYPE,DIM,false> : public StreamTask | ||
| template <class T_ValueType, unsigned T_dim> | ||
| class TaskSetValueBase : public StreamTask | ||
| { | ||
| public: | ||
| typedef T_ValueType ValueType; | ||
| static const uint32_t dim = T_dim; | ||
|
|
||
| TaskSetValue(DeviceBuffer<TYPE, DIM>& dst, const TYPE& value) : | ||
| TaskSetValueBase(DeviceBuffer<ValueType, dim>& dst, const ValueType& value) : | ||
| StreamTask(), | ||
| value(value) | ||
| { | ||
| this->destination = static_cast<DeviceBuffer<TYPE, DIM>*> (& dst); | ||
| this->destination = &dst; | ||
| } | ||
|
|
||
| virtual ~TaskSetValue() | ||
| virtual ~TaskSetValueBase() | ||
| { | ||
| notify(this->myId, SETVALUE, NULL); | ||
| CUDA_CHECK(cudaFreeHost(valuePointer_host)); | ||
| } | ||
|
|
||
| virtual void init() | ||
| { | ||
| setValue(); | ||
| } | ||
|
|
||
| virtual void init() = 0; | ||
|
|
||
| bool executeIntern() throw (std::runtime_error) | ||
| { | ||
| return isFinished(); | ||
|
|
@@ -171,101 +147,97 @@ class TaskSetValue<TYPE,DIM,false> : public StreamTask | |
| return "TaskSetValue"; | ||
| } | ||
|
|
||
| private: | ||
|
|
||
| void setValue() | ||
| { | ||
| size_t current_size = destination->getCurrentSize(); | ||
| const DataSpace<DIM> tmp(destination->getCurrentDataSpace(current_size)); | ||
| dim3 gridSize = tmp; | ||
|
|
||
| gridSize.x = (gridSize.x+255) / 256; //round up without ceil | ||
| gridSize.x *= gridSize.z; | ||
| gridSize.z = 1; | ||
|
|
||
| CUDA_CHECK(cudaMallocHost(&valuePointer_host, sizeof (TYPE))); | ||
| *valuePointer_host = value; //copy value to new place | ||
|
|
||
| CUDA_CHECK(cudaMemcpyAsync( | ||
| destination->getPointer(), valuePointer_host, sizeof (TYPE), | ||
| cudaMemcpyHostToDevice, this->getCudaStream())); | ||
| kernelSetValue << <gridSize, 256, 0, this->getCudaStream() >> > | ||
| (destination->getDataBox(), tmp); | ||
|
|
||
| this->activate(); | ||
| } | ||
|
|
||
| DeviceBuffer<TYPE, DIM> *destination; | ||
| //TYPE *valuePointer_dev; | ||
| TYPE *valuePointer_host; | ||
| TYPE value; | ||
| DeviceBuffer<ValueType, dim> *destination; | ||
| ValueType value; | ||
| }; | ||
|
|
||
| template <class TYPE, unsigned DIM> | ||
| class TaskSetValue<TYPE,DIM,true> : public StreamTask | ||
| /** implementation for small values (<= 256byte) | ||
| */ | ||
| template <class T_ValueType, unsigned T_dim> | ||
| class TaskSetValue<T_ValueType, T_dim, true> : public TaskSetValueBase<T_ValueType, T_dim> | ||
| { | ||
| public: | ||
| typedef T_ValueType ValueType; | ||
| static const uint32_t dim = T_dim; | ||
|
|
||
| TaskSetValue(DeviceBuffer<TYPE, DIM>& dst, const TYPE &value) : | ||
| StreamTask(), | ||
| value(value) | ||
| TaskSetValue(DeviceBuffer<ValueType, dim>& dst, const ValueType& value) : | ||
| TaskSetValueBase<ValueType, dim>(dst, value) | ||
| { | ||
| this->destination = & dst; | ||
| } | ||
|
|
||
| virtual ~TaskSetValue() | ||
| { | ||
| notify(this->myId, SETVALUE, NULL); | ||
|
|
||
| } | ||
|
|
||
| virtual void init() | ||
| { | ||
| size_t current_size = this->destination->getCurrentSize(); | ||
| const DataSpace<dim> area_size(this->destination->getCurrentDataSpace(current_size)); | ||
| dim3 gridSize = area_size; | ||
|
|
||
| setValue(); | ||
| /* line wise thread blocks*/ | ||
| gridSize.x = ceil(double(gridSize.x) / 256.); | ||
|
|
||
| } | ||
| kernelSetValue << <gridSize, 256, 0, this->getCudaStream() >> > | ||
| (this->destination->getDataBox(), this->value, area_size); | ||
|
|
||
| bool executeIntern() throw (std::runtime_error) | ||
| { | ||
| return isFinished(); | ||
| this->activate(); | ||
| } | ||
| }; | ||
|
|
||
| void event(id_t, EventType, IEventData*) | ||
| /** implementation for small values (>256 byte) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here, did you mean 256 elements?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think 256 bytes is fine since he wants to pass the value directly as a kernel parameter. see his description of |
||
| * | ||
| * This class use CUDA memcopy to copy a instance of T_ValueType to gpu and | ||
| * run a kernel which assign this value to all cells. | ||
| */ | ||
| template <class T_ValueType, unsigned T_dim> | ||
| class TaskSetValue<T_ValueType, T_dim, false> : public TaskSetValueBase<T_ValueType, T_dim> | ||
| { | ||
| public: | ||
| typedef T_ValueType ValueType; | ||
| static const uint32_t dim = T_dim; | ||
|
|
||
| TaskSetValue(DeviceBuffer<ValueType, dim>& dst, const ValueType& value) : | ||
| TaskSetValueBase<ValueType, dim>(dst, value), valuePointer_host(NULL) | ||
| { | ||
| } | ||
|
|
||
| protected: | ||
|
|
||
| std::string toString() | ||
| virtual ~TaskSetValue() | ||
| { | ||
| return "TaskSetValueSmall"; | ||
| if (valuePointer_host != NULL) | ||
| { | ||
| CUDA_CHECK(cudaFreeHost(valuePointer_host)); | ||
| valuePointer_host = NULL; | ||
| } | ||
| } | ||
|
|
||
| private: | ||
|
|
||
| void setValue() | ||
| void init() | ||
| { | ||
| size_t current_size = this->destination->getCurrentSize(); | ||
| const DataSpace<dim> area_size(this->destination->getCurrentDataSpace(current_size)); | ||
| dim3 gridSize = area_size; | ||
|
|
||
| /* line wise thread blocks*/ | ||
| gridSize.x = ceil(double(gridSize.x) / 256.); | ||
|
|
||
| size_t current_size = destination->getCurrentSize(); | ||
| DataSpace<DIM> tmp = destination->getCurrentDataSpace(current_size); | ||
| dim3 gridSize = tmp; | ||
| ValueType* devicePtr = this->destination->getPointer(); | ||
|
|
||
| gridSize.x = (gridSize.x+255) / 256; //round up without ceil | ||
| gridSize.x *= gridSize.z; | ||
| gridSize.z = 1; | ||
| CUDA_CHECK(cudaMallocHost(&valuePointer_host, sizeof (ValueType))); | ||
| *valuePointer_host = this->value; //copy value to new place | ||
|
|
||
| CUDA_CHECK(cudaMemcpyAsync( | ||
| devicePtr, valuePointer_host, sizeof (ValueType), | ||
| cudaMemcpyHostToDevice, this->getCudaStream())); | ||
| kernelSetValue << <gridSize, 256, 0, this->getCudaStream() >> > | ||
| (destination->getDataBox(), value,tmp); | ||
| (this->destination->getDataBox(), devicePtr, area_size); | ||
|
|
||
| this->activate(); | ||
| } | ||
|
|
||
| DeviceBuffer<TYPE, DIM> *destination; | ||
| TYPE value; | ||
| private: | ||
| ValueType *valuePointer_host; | ||
|
|
||
| }; | ||
|
|
||
| } //namespace PMacc | ||
|
|
||
|
|
||
| #endif /* _TASKSETVALUE_HPP */ | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
< 256 byte or 256 elements?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it is realy byte