feat(reduction): add reduction PI add

2025-11-15 18:03:11 +01:00
parent 2daed2f9eb
commit 1543a0f121
14 changed files with 160 additions and 1231 deletions

View File

@@ -1 +1 @@
-/home/bilat/CBI/Dropbox/02_CBI_LINUX/CoursCuda/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/01_algorithme/generic/Reduction.cu.h
+/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/01_algorithme/generic/Reduction.cu.h

View File

@@ -1 +1 @@
-/home/bilat/CBI/Dropbox/02_CBI_LINUX/CoursCuda/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/01_algorithme/add/ReductionAdd.cu.h
+/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/01_algorithme/add/ReductionAdd.cu.h

View File

@@ -1 +1 @@
-/home/bilat/CBI/Dropbox/02_CBI_LINUX/CoursCuda/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/add/int/PI/host/ReductionAddIntI.h
+/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/add/int/PI/host/ReductionAddIntI.h

View File

@@ -1 +1 @@
-/home/bilat/CBI/Dropbox/02_CBI_LINUX/CoursCuda/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/add/int/PII/host/ReductionAddIntII.h
+/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/add/int/PII/host/ReductionAddIntII.h

View File

@@ -1 +1 @@
-/home/bilat/CBI/Dropbox/02_CBI_LINUX/CoursCuda/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/generic/int/PI/host/ReductionIntI.h
+/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/generic/int/PI/host/ReductionIntI.h

View File

@@ -1 +1 @@
-/home/bilat/CBI/Dropbox/02_CBI_LINUX/CoursCuda/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/generic/int/PII/host/ReductionIntII.h
+/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/generic/int/PII/host/ReductionIntII.h

View File

@@ -1 +1 @@
-/home/bilat/CBI/Dropbox/02_CBI_LINUX/CoursCuda/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/generic/long/PII/host/ReductionLongII.h
+/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda_Tools_Reduction/src/core/02_use_protocole/generic/long/PII/host/ReductionLongII.h

File diff suppressed because it is too large

View File

@@ -1,13 +1,12 @@
 #pragma once
-#include "Thread1D.cu.h"
+#include "Thread2D.cu.h"
 /*----------------------------------------------------------------------*\
 |* Implementation *|
 \*---------------------------------------------------------------------*/
-class ReductionAdd
-{
+class ReductionAdd {
 public:
 /**
@@ -46,8 +45,9 @@ class ReductionAdd
 *
 */
 template <typename T>
-static __device__ void reduce(T* tabSM, T* ptrResultGM)
-{
+static
+__device__
+void reduce(T* tabSM, T* ptrResultGM) {
 // Reminder:
 // |ThreadByBlock| = |tabSM|.
 // There are as many SM cells as there are threads per block.
@@ -55,8 +55,8 @@ class ReductionAdd
 // 1 thread <---> 1 locker (SM cell)
 // TODO ReductionAdd
-// reductionIntraBlock
-// reductionInterblock
+reductionIntraBlock(tabSM);
+reductionInterBlock(tabSM, ptrResultGM);
 // __syncthreads(); // for all threads of the same block; needed? where? not at the end, in any case
 }
@@ -71,8 +71,9 @@ class ReductionAdd
 * used by reductionIntraBlock
 */
 template <typename T>
-static __device__ void ecrasement(T* tabSM, int middle)
-{
+static
+__device__
+void ecrasement(T* tabSM, int middle) {
 // Hints:
 // (I1) I am one thread: what do I have to do?
 // (I2) Do all the threads have something to do?
@@ -80,6 +81,12 @@ class ReductionAdd
 // TODO ReductionAdd
+const int localTID = Thread2D::tidLocal();
+if (localTID < middle) {
+    tabSM[localTID] = tabSM[localTID] + tabSM[localTID + middle];
+}
 // __syncthreads(); // for all threads of the same block; needed? where?
 }
@@ -87,12 +94,22 @@ class ReductionAdd
 * In place; the result ends up in tabSM[0]
 */
 template <typename T>
-static __device__ void reductionIntraBlock(T* tabSM)
-{
+static
+__device__
+void reductionIntraBlock(T* tabSM) { // Reduce tabSM (everything into [0])
 // Successive halvings in a loop (use the ecrasement method above)
 // TODO ReductionAdd
+const int NB_THREAD_LOCAL = Thread2D::nbThreadLocal();
+int middle = NB_THREAD_LOCAL >> 1;
+while (middle > 0) {
+    ecrasement(tabSM, middle);
+    __syncthreads();
+    middle = middle >> 1;
+}
 // __syncthreads(); // for all threads of the same block; needed? where?
 }
@@ -101,13 +118,17 @@ class ReductionAdd
 \*-------------------------------------*/
 template <typename T>
-static __device__ void reductionInterBlock(T* tabSM, T* ptrResultGM)
-{
+static
+__device__
+void reductionInterBlock(T* tabSM, T* ptrResultGM) { // SM -> GM
 // Hints:
 // (I1) Use atomicAdd(destinationPointer, sourceValue);
 // (I2) Work under the assumption of a 2D grid, with Thread2D
 // TODO ReductionAdd
+if (Thread2D::tidLocal() == 0) {
+    atomicAdd(ptrResultGM, tabSM[0]);
+}
 // __syncthreads(); // for all threads of the same block; needed? where?
 }
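Taken together, these hunks implement the classic two-level reduction: successive intra-block halvings in shared memory, then one atomicAdd per block into global memory. Below is a self-contained CUDA sketch of the same pattern, assuming a 1D launch and a power-of-two block size; it uses raw threadIdx/blockDim where the course code uses the Thread2D helpers, and the names kSumSketch and tabGM are illustrative, not part of this commit.

// Standalone sketch of the pattern above (hypothetical names).
// Launch as: kSumSketch<<<nbBlocks, nbThreads, nbThreads * sizeof(int)>>>(tabGM, n, ptrResultGM);
// with *ptrResultGM zeroed beforehand and nbThreads a power of two.
__global__ void kSumSketch(const int* tabGM, int n, int* ptrResultGM)
    {
    extern __shared__ int tabSM[]; // sized by the 3rd launch parameter

    const int tidLocal  = threadIdx.x;
    const int tidGlobal = blockIdx.x * blockDim.x + threadIdx.x;

    // reductionIntraThread: one value per thread into shared memory
    tabSM[tidLocal] = (tidGlobal < n) ? tabGM[tidGlobal] : 0;
    __syncthreads();

    // reductionIntraBlock: successive halvings, result lands in tabSM[0]
    for (int middle = blockDim.x >> 1; middle > 0; middle >>= 1)
        {
        if (tidLocal < middle)
            {
            tabSM[tidLocal] += tabSM[tidLocal + middle];
            }
        __syncthreads(); // each halving must finish before the next one starts
        }

    // reductionInterBlock: one atomicAdd per block into global memory
    if (tidLocal == 0)
        {
        atomicAdd(ptrResultGM, tabSM[0]);
        }
    }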

View File

@@ -1,7 +1,7 @@
 #pragma once
 #include "Lock.cu.h"
-#include "Thread1D.cu.h"
+#include "Thread2D.cu.h"
 /*----------------------------------------------------------------------*\
 |* ptr function / reduction *|
@@ -14,8 +14,7 @@
 |* Implementation *|
 \*---------------------------------------------------------------------*/
-class Reduction
-{
+class Reduction {
 public:
 /**
@@ -50,7 +49,9 @@ class Reduction
 * ReductionGeneric::reduce(add,addAtomic,tabSm,ptrResultGM);
 */
 template <typename T>
-static __device__ void reduce(BinaryOperator(OP), AtomicOp(ATOMIC_OP), T* tabSM, T* ptrResultGM)
+static
+__device__
+void reduce(BinaryOperator(OP), AtomicOp(ATOMIC_OP), T* tabSM, T* ptrResultGM)
 //static __device__ void reduce(T (*OP)(T, T), void (*ATOMIC_OP)(T*, T), T* tabSM, T* ptrResultGM) // same as above but without the define
 {
 // Same principle as ReductionAdd
@@ -69,8 +70,9 @@ class Reduction
 * used by reductionIntraBlock
 */
 template <typename T>
-static __device__ void ecrasement(BinaryOperator(OP), T* tabSM, int middle)
-{
+static
+__device__
+void ecrasement(BinaryOperator(OP), T* tabSM, int middle) {
 // TODO ReductionGeneric
 // Same principle as ReductionAdd
 // OP is the variable holding the binary operator
@@ -80,8 +82,9 @@ class Reduction
 * In place; the result ends up in tabSM[0]
 */
 template <typename T>
-static __device__ void reductionIntraBlock(BinaryOperator(OP), T* tabSM)
-{
+static
+__device__
+void reductionIntraBlock(BinaryOperator(OP), T* tabSM) {
 // TODO ReductionGeneric
 // Same principle as ReductionAdd
 // OP is the variable holding the binary operator
@@ -92,8 +95,9 @@ class Reduction
 \*-------------------------------------*/
 template <typename T>
-static __device__ void reductionInterBlock(AtomicOp(ATOMIC_OP), T* tabSM, T* ptrResultGM)
-{
+static
+__device__
+void reductionInterBlock(AtomicOp(ATOMIC_OP), T* tabSM, T* ptrResultGM) {
 // TODO ReductionGeneric
 // Same principle as ReductionAdd
 // ATOMIC_OP is the variable holding the atomic binary operator
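The generic bodies are still TODO in this commit; only the signatures were restyled. As a hint of where they are headed, here is a minimal sketch of ecrasement with the operator threaded through. It uses a template functor because the BinaryOperator/AtomicOp macro definitions are not visible in this diff; AddOp and ecrasementGeneric are illustrative names only.

// Hypothetical functor standing in for BinaryOperator(OP)
struct AddOp
    {
    template <typename T>
    __device__ T operator()(T a, T b) const { return a + b; }
    };

template <typename T, typename OP>
__device__ void ecrasementGeneric(OP op, T* tabSM, int middle)
    {
    const int tidLocal = threadIdx.x; // Thread2D::tidLocal() in the course code
    if (tidLocal < middle)
        {
        // op replaces the hard-wired '+' of ReductionAdd::ecrasement
        tabSM[tidLocal] = op(tabSM[tidLocal], tabSM[tidLocal + middle]);
        }
    }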

View File

@@ -1,4 +1,4 @@
#include "Thread1D.cu.h" #include "Thread2D.cu.h"
#include "cudas.h" #include "cudas.h"
#include "ReductionAdd.cu.h" #include "ReductionAdd.cu.h"
@@ -9,7 +9,9 @@
 |* Private *|
 \*---------------------------------------------------------------------*/
-static __device__ void reductionIntraThread(int* tabSM);
+static
+__device__
+void reductionIntraThread(int* tabSM);
 /*----------------------------------------------------------------------*\
 |* Implementation *|
@@ -18,13 +20,19 @@ static __device__ void reductionIntraThread(int* tabSM);
 /**
 * 1 everywhere in tabSM
 */
-__global__ void KAddIntProtocoleI(int* ptrSumGM)
-{
+__global__
+void KAddIntProtocoleI(int* ptrSumGM) {
 // TODO ReductionAddIntI
 // Receive tabSM
+extern __shared__ int tabSM[];
 // ReductionIntraThread
+reductionIntraThread(tabSM);
+__syncthreads();
 // ReductionAdd
+ReductionAdd::reduce(tabSM, ptrSumGM);
 // __syncthreads(); // threads of the same block! // Question: useful? where?
 }
@@ -36,12 +44,15 @@ __global__ void KAddIntProtocoleI(int* ptrSumGM)
 /**
 * 1 everywhere in tabSM
 */
-__device__ void reductionIntraThread(int* tabSM)
-{
+__device__
+void reductionIntraThread(int* tabSM) {
 // TODO ReductionAddIntI
+const int localTID = Thread2D::tidLocal();
+tabSM[localTID] = 1;
 }
 /*----------------------------------------------------------------------*\
 |* End *|
 \*---------------------------------------------------------------------*/
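The kernel leans on Thread2D for intra-block indexing. Thread2D's source is not part of this diff, but for a 2D block the local index and local thread count are conventionally computed as in the sketch below (tidLocal2D and nbThreadLocal2D are illustrative names, not the course API):

// Hedged sketch of the conventional 2D intra-block indexing that
// Thread2D::tidLocal() / Thread2D::nbThreadLocal() presumably wrap.
__device__ int tidLocal2D()
    {
    return threadIdx.y * blockDim.x + threadIdx.x;
    }

__device__ int nbThreadLocal2D()
    {
    return blockDim.x * blockDim.y;
    }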

View File

@@ -27,32 +27,33 @@ extern __global__ void KAddIntProtocoleI(int* ptrSumGM);
 ReductionAddIntI::ReductionAddIntI(const Grid& grid, int* ptrSum, bool isVerbose) :
 //RunnableGPU(grid, "Reduce_Add_IntI_" + to_string(grid.threadCounts()), isVerbose), // parent class
 RunnableGPU(grid, "Reduce_Add_IntI", isVerbose), // parent class
-ptrSum(ptrSum)
-{
+ptrSum(ptrSum) {
 // TODO ReductionAddIntI
 // MM for ptrSumGM (don't forget the initialization)
-this->sizeSM = -1;
+this->sizeSM = grid.threadByBlock() * sizeof(int);
 // Tip: there is a dedicated method to malloc an int on the device side and initialize it to zero
 //
-// GM::mallocInt0(&ptrSumGM);
+GM::mallocInt0(&ptrSumGM);
 }
-ReductionAddIntI::~ReductionAddIntI()
-{
+ReductionAddIntI::~ReductionAddIntI() {
 // TODO ReductionAddIntI
+GM::free(ptrSumGM);
 }
 /*--------------------------------------*\
 |* Method *|
 \*-------------------------------------*/
-void ReductionAddIntI::run()
-{
+void ReductionAddIntI::run() {
 // TODO ReductionAddIntI
 // call the kernel
 // fetch the result on the host side
+KAddIntProtocoleI<<<dg, db, this->sizeSM>>>(ptrSumGM);
+GM::memcpyDToH_int(ptrSum, ptrSumGM);
 // Tip: there is a dedicated method to bring an int back to the host side
 //
 // GM::memcpyDtoH_int(ptrDestination, ptrSourceGM);
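GM::mallocInt0, GM::memcpyDToH_int and GM::free are course helpers; their plain CUDA-runtime equivalents, plus the sizeSM / extern __shared__ pairing, look roughly like the sketch below. The grid and block sizes are hypothetical (the real ones come from the Grid object), and launchSketch is an illustrative name.

#include <cuda_runtime.h>

extern __global__ void KAddIntProtocoleI(int* ptrSumGM);

int launchSketch()
    {
    int* ptrSumGM = nullptr;
    cudaMalloc(&ptrSumGM, sizeof(int));   // GM::mallocInt0: device malloc...
    cudaMemset(ptrSumGM, 0, sizeof(int)); // ...plus zero-init (atomicAdd needs a zeroed start)

    dim3 dg(16, 1);                            // hypothetical grid: 16 blocks
    dim3 db(256, 1);                           // hypothetical block: 256 threads, power of two
    size_t sizeSM = db.x * db.y * sizeof(int); // one int cell per thread: |tabSM| = |ThreadByBlock|

    KAddIntProtocoleI<<<dg, db, sizeSM>>>(ptrSumGM);

    int sum = 0;
    cudaMemcpy(&sum, ptrSumGM, sizeof(int), cudaMemcpyDeviceToHost); // GM::memcpyDToH_int
    cudaFree(ptrSumGM);                                              // GM::free
    return sum; // each of the 16 * 256 = 4096 threads contributed a 1
    }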

View File

@@ -8,8 +8,7 @@
 |* Declaration *|
 \*---------------------------------------------------------------------*/
-class ReductionAddIntI: public RunnableGPU
-{
+class ReductionAddIntI: public RunnableGPU {
 /*--------------------------------------*\
 |* Constructor *|
 \*-------------------------------------*/

View File

@@ -26,8 +26,8 @@ int main(int argc , char** argv)
 // public
 {
-cudaContext.deviceId = 0; // in [0,2] with Server Cuda3
-cudaContext.launchMode = LaunchModeMOO::USE; // USE TEST (only)
+cudaContext.deviceId = 1; // in [0,2] with Server Cuda3
+cudaContext.launchMode = LaunchModeMOO::TEST; // USE TEST (only)
 cudaContext.deviceDriver = DeviceDriver::LOAD_ALL; // LOAD_CURRENT LOAD_ALL
 cudaContext.deviceInfo = DeviceInfo::ALL_SIMPLE; // NONE ALL ALL_SIMPLE CURRENT
@@ -45,4 +45,3 @@ int main(int argc , char** argv)
 /*----------------------------------------------------------------------*\
 |* End *|
 \*---------------------------------------------------------------------*/