feat(reduction): add reduction PI add

This commit is contained in:
2025-11-15 18:03:11 +01:00
parent 2daed2f9eb
commit 1543a0f121
14 changed files with 160 additions and 1231 deletions

View File

@@ -1,13 +1,12 @@
#pragma once
#include "Thread1D.cu.h"
#include "Thread2D.cu.h"
/*----------------------------------------------------------------------*\
|* Implementation *|
\*---------------------------------------------------------------------*/
class ReductionAdd
{
class ReductionAdd {
public:
/**
@@ -46,8 +45,9 @@ class ReductionAdd
*
*/
template <typename T>
static __device__ void reduce(T* tabSM, T* ptrResultGM)
{
static
__device__
void reduce(T* tabSM, T* ptrResultGM) {
// Rappel :
// |ThreadByBlock|=|tabSM| .
// Il y a autant de cases en SM que de threads par block.
@@ -55,11 +55,11 @@ class ReductionAdd
// 1 thread <---> 1 armoire
// TODO ReductionAdd
// reductionIntraBlock
// reductionInterblock
reductionIntraBlock(tabSM);
reductionInterBlock(tabSM, ptrResultGM);
// __syncthreads();// pour tous les threads d'un meme block, necessaire? ou? pas a la fin en tous les cas
}
}
private:
@@ -71,8 +71,9 @@ class ReductionAdd
* used by reductionIntraBlock
*/
template <typename T>
static __device__ void ecrasement(T* tabSM, int middle)
{
static
__device__
void ecrasement(T* tabSM, int middle) {
// Indications :
// (I1) je suis un thread, je dois faire quoi ?
// (I2) Tous les threads doivent-ils faire quelquechose?
@@ -80,39 +81,59 @@ class ReductionAdd
// TODO ReductionAdd
const int localTID = Thread2D::tidLocal();
if(localTID < middle) {
tabSM[localTID] = tabSM[localTID] + tabSM[localTID + middle];
}
// __syncthreads();// pour touts les threads d'un meme block, necessaires? ou?
}
}
/**
* Sur place, le resultat est dans tabSM[0]
*/
template <typename T>
static __device__ void reductionIntraBlock(T* tabSM)
{
static
__device__
void reductionIntraBlock(T* tabSM) { // Reduce tab SM (all in [0])
// Ecrasements successifs dans une boucle (utiliser la methode ecrasement ci-dessus)
// TODO ReductionAdd
const int NB_THREAD_LOCAL = Thread2D::nbThreadLocal();
int middle = NB_THREAD_LOCAL>>1;
while (middle > 0) {
ecrasement(tabSM, middle);
__syncthreads();
middle = middle >> 1;
}
// __syncthreads();// pour touts les threads d'un meme block, necessaires? ou?
}
}
/*--------------------------------------*\
|* reductionInterblock *|
\*-------------------------------------*/
template <typename T>
static __device__ void reductionInterBlock(T* tabSM, T* ptrResultGM)
{
static
__device__
void reductionInterBlock(T* tabSM, T* ptrResultGM) { // SM -> GM
// Indication:
// (I1) Utiliser atomicAdd(pointeurDestination, valeurSource);
// (i2) Travailler sous l hypothese d'une grid2d,avec Thread2D
// TODO ReductionAdd
if(Thread2D::tidLocal() == 0) {
atomicAdd(ptrResultGM, tabSM[0]);
}
// __syncthreads();// pour touts les threads d'un meme block, necessaires? ou?
}
}
};
};
/*----------------------------------------------------------------------*\
|* End *|

View File

@@ -1,7 +1,7 @@
#pragma once
#include "Lock.cu.h"
#include "Thread1D.cu.h"
#include "Thread2D.cu.h"
/*----------------------------------------------------------------------*\
|* prt fonction / reduction *|
@@ -14,8 +14,7 @@
|* Implementation *|
\*---------------------------------------------------------------------*/
class Reduction
{
class Reduction {
public:
/**
@@ -50,14 +49,16 @@ class Reduction
* ReductionGeneric::reduce(add,addAtomic,tabSm,ptrResultGM);
*/
template <typename T>
static __device__ void reduce(BinaryOperator(OP) ,AtomicOp(ATOMIC_OP), T* tabSM, T* ptrResultGM)
static
__device__
void reduce(BinaryOperator(OP) ,AtomicOp(ATOMIC_OP), T* tabSM, T* ptrResultGM)
//static __device__ void reduce(T (*OP)(T, T) ,void (*ATOMIC_OP)(T*, T), T* tabSM, T* ptrResultGM) // idem ci-dessus mais sans define
{
{
// Meme principe que ReductionAdd
// TODO ReductionGeneric
// Meme principe que ReductionAdd
}
}
private:
@@ -69,37 +70,40 @@ class Reduction
* used by reductionIntraBlock
*/
template <typename T>
static __device__ void ecrasement(BinaryOperator(OP),T* tabSM, int middle)
{
static
__device__
void ecrasement(BinaryOperator(OP),T* tabSM, int middle) {
// TODO ReductionGeneric
// Meme principe que ReductionAdd
// OP est la variable representant l'operateur binaire
}
}
/**
* Sur place, le resultat est dans tabSM[0]
*/
template <typename T>
static __device__ void reductionIntraBlock(BinaryOperator(OP),T* tabSM)
{
static
__device__
void reductionIntraBlock(BinaryOperator(OP),T* tabSM) {
// TODO ReductionGeneric
// Meme principe que ReductionAdd
// OP est la variable representant l'operateur binaire
}
}
/*--------------------------------------*\
|* reductionInterblock *|
\*-------------------------------------*/
template <typename T>
static __device__ void reductionInterBlock(AtomicOp(ATOMIC_OP), T* tabSM, T* ptrResultGM)
{
static
__device__
void reductionInterBlock(AtomicOp(ATOMIC_OP), T* tabSM, T* ptrResultGM) {
// TODO ReductionGeneric
// Meme principe que ReductionAdd
// ATOMIC_OP est la variable representant l'operateur binaire atomic
}
}
};
};
/*----------------------------------------------------------------------*\
|* End *|

View File

@@ -1,4 +1,4 @@
#include "Thread1D.cu.h"
#include "Thread2D.cu.h"
#include "cudas.h"
#include "ReductionAdd.cu.h"
@@ -9,7 +9,9 @@
|* Private *|
\*---------------------------------------------------------------------*/
static __device__ void reductionIntraThread(int* tabSM);
static
__device__
void reductionIntraThread(int* tabSM);
/*----------------------------------------------------------------------*\
|* Implementation *|
@@ -18,13 +20,19 @@ static __device__ void reductionIntraThread(int* tabSM);
/**
* 1 partout en tabSM
*/
__global__ void KAddIntProtocoleI(int* ptrSumGM)
{
__global__
void KAddIntProtocoleI(int* ptrSumGM) {
// TODO ReductionAddIntI
// Reception tabSM
extern __shared__ int tabSM[];
// ReductionIntraThread
reductionIntraThread(tabSM);
__syncthreads();
// ReductionAdd
ReductionAdd::reduce(tabSM, ptrSumGM);
// __syncthreads(); // des threads de meme block!// Question : utile? ou?
}
@@ -36,12 +44,15 @@ __global__ void KAddIntProtocoleI(int* ptrSumGM)
/**
* 1 partout en tabSM
*/
__device__ void reductionIntraThread(int* tabSM)
{
__device__
void reductionIntraThread(int* tabSM) {
// TODO ReductionAddIntI
}
const int localTID = Thread2D::tidLocal();
tabSM[localTID] = 1;
}
/*----------------------------------------------------------------------*\
|* End *|
\*---------------------------------------------------------------------*/

View File

@@ -27,32 +27,33 @@ extern __global__ void KAddIntProtocoleI(int* ptrSumGM);
ReductionAddIntI::ReductionAddIntI(const Grid& grid , int* ptrSum , bool isVerbose) :
//RunnableGPU(grid, "Reduce_Add_IntI_" + to_string(grid.threadCounts()),isVerbose), // classe parente
RunnableGPU(grid, "Reduce_Add_IntI", isVerbose), // classe parente
ptrSum(ptrSum)
{
ptrSum(ptrSum) {
// TODO ReductionAddIntI
// MM pour ptrSumGM (oubliez pas initialisation)
this->sizeSM = -1;
this->sizeSM = grid.threadByBlock() * sizeof(int);
// Tip: Il y a une methode dediee pour malloquer un int cote device et l'initialiser a zero
//
// GM::mallocInt0(&ptrSumGM);
}
GM::mallocInt0(&ptrSumGM);
}
ReductionAddIntI::~ReductionAddIntI()
{
ReductionAddIntI::~ReductionAddIntI() {
// TODO ReductionAddIntI
}
GM::free(ptrSumGM);
}
/*--------------------------------------*\
|* Methode *|
\*-------------------------------------*/
void ReductionAddIntI::run()
{
void ReductionAddIntI::run() {
// TODO ReductionAddIntI
// appeler le kernel
// recuperer le resultat cote host
KAddIntProtocoleI<<<dg,db,this->sizeSM>>>(ptrSumGM);
GM::memcpyDToH_int(ptrSum, ptrSumGM);
// Tip: Il y a une methode dediee pour ramener cote host un int
//
// GM::memcpyDtoH_int(ptrDestination, ptrSourceGM);

View File

@@ -8,8 +8,7 @@
|* Declaration *|
\*---------------------------------------------------------------------*/
class ReductionAddIntI: public RunnableGPU
{
class ReductionAddIntI: public RunnableGPU {
/*--------------------------------------*\
|* Constructor *|
\*-------------------------------------*/

View File

@@ -26,8 +26,8 @@ int main(int argc , char** argv)
// public
{
cudaContext.deviceId = 0; // in [0,2] width Server Cuda3
cudaContext.launchMode = LaunchModeMOO::USE; // USE TEST (only)
cudaContext.deviceId = 1; // in [0,2] width Server Cuda3
cudaContext.launchMode = LaunchModeMOO::TEST; // USE TEST (only)
cudaContext.deviceDriver = DeviceDriver::LOAD_ALL; // LOAD_CURRENT LOAD_ALL
cudaContext.deviceInfo = DeviceInfo::ALL_SIMPLE; // NONE ALL ALL_SIMPLE CURRENT
@@ -45,4 +45,3 @@ int main(int argc , char** argv)
/*----------------------------------------------------------------------*\
|* End *|
\*---------------------------------------------------------------------*/