From bf9a0a29414e589fe8ded1ff12b1a45fcac3dbd7 Mon Sep 17 00:00:00 2001
From: Klagarge <remi@heredero.ch>
Date: Mon, 24 Nov 2025 23:21:59 +0100
Subject: [PATCH] feat(sliceSM): add sliceSM

---
 .zed/tasks.json                               |  6 +--
 Student_Cuda/.clangd                          |  3 ++
 .../03_Slice_SM/device/sliceSM_device.cu      | 39 ++++++++++++++-----
 .../03_Slice/03_Slice_SM/host/SliceSM.cu      | 30 +++++++-------
 .../03_Slice/03_Slice_SM/host/SliceSM.h       |  5 +--
 .../03_Slice_SM/host/SliceSM_BestGrid.h       | 25 ++++++------
 Student_Cuda/src/main/main.cpp                |  2 +-
 Student_Cuda/src/main/mainBenchmark.cpp       |  5 ++-
 Student_Cuda/src/main/mainBruteforce.cpp      |  4 +-
 Student_Cuda/src/main/mainTest.cpp            | 13 +++----
 Student_Cuda/src/main/mainUse.cpp             |  8 ++--
 11 files changed, 79 insertions(+), 61 deletions(-)

diff --git a/.zed/tasks.json b/.zed/tasks.json
index a6c8342..e3d5ef1 100644
--- a/.zed/tasks.json
+++ b/.zed/tasks.json
@@ -1,10 +1,8 @@
-// Project tasks configuration. See https://zed.dev/docs/tasks for documentation.
-//
-// Example:
 [
     {
-        "label": "Run",
+        "label": "Run CUDA - Student_Cuda",
         "command": "cbicc cuda clean jall run",
+        "cwd": "Student_Cuda",
         "use_new_terminal": false,
         "allow_concurrent_runs": false,
         "reveal": "always",
diff --git a/Student_Cuda/.clangd b/Student_Cuda/.clangd
index 7c92112..ba398d5 100644
--- a/Student_Cuda/.clangd
+++ b/Student_Cuda/.clangd
@@ -2,6 +2,9 @@ CompileFlags:
     Add:
         - "-I/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda/INC_SYMLINK/EXT"
         - "-I/home/mse15/CUDA/toStudent/code/WCudaStudent/Student_Cuda/INC_SYMLINK/PROJECT"
+        - "-std=c++17"
+        - "-x"
+        - "cuda"
 ---
 Diagnostics:
   Suppress: "*"
diff --git a/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/device/sliceSM_device.cu b/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/device/sliceSM_device.cu
index b6ee84c..fa4abe8 100755
--- a/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/device/sliceSM_device.cu
+++ b/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/device/sliceSM_device.cu
@@ -17,16 +17,21 @@ static __device__ float f(float x);
  |*			Implementation 					*|
  \*---------------------------------------------------------------------*/
 
-__global__ void sliceSM(int nbSlice , float* ptrPiHatGM)
-    {
+__global__
+void sliceSM(int nbSlice , float* ptrPiHatGM) {
     // TODO SliceSM
 
     // Reception tabSM
+    extern __shared__ float tabSM[]; // dynamically sized shared array: its byte size is given at kernel launch
     // ReductionIntraThread
+    reductionIntraThread(tabSM, nbSlice); // each thread stores its own partial sum in its cell of tabSM
+    __syncthreads(); // barrier: every cell of tabSM must be written before the block reduces it
     // Reduction de tabSM (use tools ReductionAdd)
+    ReductionAdd::reduce(tabSM, ptrPiHatGM); // NOTE(review): presumably sums tabSM and adds the block total into *ptrPiHatGM - confirm against ReductionAdd
+
 
     // __syncthreads(); necessaire? ou? pas a la fin en tout cas
-    }
+}
 
 /*--------------------------------------*\
  |*		Private			*|
@@ -35,19 +40,33 @@ __global__ void sliceSM(int nbSlice , float* ptrPiHatGM)
 /**
  * remplit la sm
  */
-void reductionIntraThread(float* tabSM , int nbSlice)
-    {
+static
+__device__
+void reductionIntraThread(float* tabSM , int nbSlice) {
     // TODO SliceSM
 
     // Warning: Il faut employer TID et TID_LOCAL
-    }
+    const int TID = Thread2D::tid(); // used below as the first slice index handled by this thread
+    const int localTID = Thread2D::tidLocal(); // index of this thread's cell in tabSM
+    const int NB_THREAD = Thread2D::nbThread(); // stride between two slices handled by the same thread
 
-__device__ float f(float x)
-    {
-    return 4.f / (1.f + x * x);
+    const float delta_x = 1.f / (float)nbSlice; // width of one slice when [0,1] is cut into nbSlice parts
+
+    int s = TID;
+    tabSM[localTID] = 0.f; // clear this thread's cell before accumulating into it
+
+    while (s < nbSlice) {
+        float xi = s * delta_x; // left edge of slice s
+        tabSM[localTID] += f(xi); // accumulate the sample; no scaling by delta_x happens in this function
+        s += NB_THREAD;
     }
+}
+
+__device__
+float f(float x) { // integrand 4 / (1 + x^2), evaluated in single precision
+    return 4.f / (1.f + x * x);
+}
 
 /*----------------------------------------------------------------------*\
  |*			End	 					*|
  \*---------------------------------------------------------------------*/
-
diff --git a/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM.cu b/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM.cu
index 63836d6..493ab71 100755
--- a/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM.cu
+++ b/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM.cu
@@ -25,34 +25,31 @@ extern __global__ void sliceSM(int nbSlice,float* ptrPiHatGM);
  \*-------------------------------------*/
 
 SliceSM::SliceSM(const Grid& grid , int nbSlice , double* ptrPiHat , bool isVerbose) :
-	RunnableGPU(grid, "SliceSM_" + to_string(nbSlice), isVerbose), // classe parente
-	//
-	ptrPiHat(ptrPiHat), //
-	nbSlice(nbSlice) //
-    {
-    this->sizeSM = -1; //TODO SliceSM
+	RunnableGPU(grid, "SliceSM_" + to_string(nbSlice), isVerbose),
+	ptrPiHat(ptrPiHat),
+	nbSlice(nbSlice) {
+    this->sizeSM = grid.threadByBlock() * sizeof(float); // [bytes] one float of shared memory per thread of a block
 
     // MM
 	{
 	// TODO SliceSM (pas oublier de mettre a zero, avec mallocfloat0 par exemple)
-
+	GM::mallocFloat0(&ptrPiHatGM); // allocates the device-side float and zeroes it (per the tip just below)
 	// Tip:		Il y a une methode dedier pour malloquer un float cote device et l'initialiser a zero
 	//
 	//		   GM::mallocfloat0(&ptrPiHatGM);
 	}
-    }
+}
 
-SliceSM::~SliceSM(void)
-    {
+SliceSM::~SliceSM(void) {
     //TODO SliceSM
-    }
+    GM::free(ptrPiHatGM); // releases the device float obtained with GM::mallocFloat0 in the constructor
+}
 
 /*--------------------------------------*\
  |*		Methode			*|
  \*-------------------------------------*/
 
-void SliceSM::run()
-    {
+void SliceSM::run() {
     // Etape 1 : lancer le kernel
     // Etape 2 : recuperer le resultat coter host (par exemple avec memcpyDToH_float)
     // Etape 3 : finaliser le calcul de PI
@@ -60,7 +57,12 @@ void SliceSM::run()
     // Solution : double result; // et ramener dans result, transferer et finaliser ensuite dans ptrPiHat
 
     // TODO SliceSM
-    }
+    sliceSM<<<dg,db,this->sizeSM>>>(this->nbSlice, this->ptrPiHatGM); // third launch argument = dynamic shared-memory size in bytes
+    float result; // host-side copy of the single-precision sum produced on the device
+    GM::memcpyDToH_float(&result, this->ptrPiHatGM); // NOTE(review): presumably a blocking copy that also waits for the kernel - confirm against GM
+    const double delta_x = 1.0 / (double) this->nbSlice; // slice width, in double precision on the host
+    *this->ptrPiHat = (double) result * delta_x; // scale the summed samples by the slice width; NOTE(review): ptrPiHatGM is zeroed only in the constructor, so repeated run() calls may accumulate - confirm
+}
 
 /////////////////////////
 //	Rappel:
diff --git a/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM.h b/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM.h
index 7264654..3645e7b 100755
--- a/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM.h
+++ b/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM.h
@@ -8,8 +8,7 @@
  |*			Declaration 					*|
  \*---------------------------------------------------------------------*/
 
-class SliceSM: public RunnableGPU
-    {
+class SliceSM: public RunnableGPU {
 	/*--------------------------------------*\
 	|*		Constructor		*|
 	 \*-------------------------------------*/
@@ -51,7 +50,7 @@ class SliceSM: public RunnableGPU
 	size_t sizeSM; // [octet]
 	float* ptrPiHatGM;
 
-    };
+};
 
 /*----------------------------------------------------------------------*\
  |*			End	 					*|
diff --git a/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM_BestGrid.h b/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM_BestGrid.h
index b59cebd..6f4e9ce 100755
--- a/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM_BestGrid.h
+++ b/Student_Cuda/src/core/01_student/03_Slice/03_Slice_SM/host/SliceSM_BestGrid.h
@@ -11,29 +11,26 @@
  |*			Impelmentation 					*|
  \*---------------------------------------------------------------------*/
 
-namespace sliceSM
-    {
+namespace sliceSM {
 
-    class BestGrid
-	{
+    class BestGrid {
 
 	public:
 
-	    static Grid get()
-		{
-		const int MP = Hardware::getMPCount();
+	    static Grid get() {
+    		const int MP = Hardware::getMPCount();
+            const int CORE_MP = Hardware::getCoreCountMP();
 
-		// TODO SliceGMHOST grid
+    		// Launch shape for SliceSM: MP x 6 blocks, each of CORE_MP x 2 threads
+            dim3 dg(MP, 6, 1);
+            dim3 db(CORE_MP, 2, 1); // NOTE(review): if ReductionAdd requires a power-of-two block size, CORE_MP * 2 must be one - confirm
+            Grid grid(dg, db);
 
-		// to remove once coded
-		    {
-		    Couts::redln("aie aie aie, your best grid won t build itself");
-		    assert(false);
-		    }
+            return grid;
 		}
 
 	};
-    }
+}
 
 /*----------------------------------------------------------------------*\
  |*			End	 					*|
diff --git a/Student_Cuda/src/main/main.cpp b/Student_Cuda/src/main/main.cpp
index 5e198e6..c757d67 100755
--- a/Student_Cuda/src/main/main.cpp
+++ b/Student_Cuda/src/main/main.cpp
@@ -29,7 +29,7 @@ int main(int argc , char** argv)
     // public
 	{
 	cudaContext.deviceId = 1; //  in [0,2] width Server Cuda3
-	cudaContext.launchMode = LaunchModeMOO::USE; // USE TEST  BENCHMARK  FORCEBRUT
+	cudaContext.launchMode = LaunchModeMOO::TEST; // USE TEST  BENCHMARK  FORCEBRUT
 
 	cudaContext.deviceDriver = DeviceDriver::LOAD_ALL; // LOAD_CURRENT   LOAD_ALL
 	cudaContext.deviceInfo = DeviceInfo::ALL_SIMPLE;   // NONE  ALL  ALL_SIMPLE  CURRENT
diff --git a/Student_Cuda/src/main/mainBenchmark.cpp b/Student_Cuda/src/main/mainBenchmark.cpp
index 148447c..4f562cf 100755
--- a/Student_Cuda/src/main/mainBenchmark.cpp
+++ b/Student_Cuda/src/main/mainBenchmark.cpp
@@ -34,6 +34,7 @@ using std::endl;
 
 static void sliceGMHOST();
 static void sliceGM();
+static void sliceSM();
 
 
 static void montecarloMono();
@@ -59,8 +60,8 @@ int mainBenchmark()
     // Slice
 	{
 	// sliceGMHOST();
-	sliceGM();
-	//sliceSM();
+	// sliceGM();
+	sliceSM();
 	//sliceMulti();
 	}
 
diff --git a/Student_Cuda/src/main/mainBruteforce.cpp b/Student_Cuda/src/main/mainBruteforce.cpp
index 02ca114..387e73c 100755
--- a/Student_Cuda/src/main/mainBruteforce.cpp
+++ b/Student_Cuda/src/main/mainBruteforce.cpp
@@ -75,8 +75,8 @@ int mainBrutforce()
     // Slice
 	{
 	// sliceGMHOST(&matlab);
-	sliceGM(&matlab);
-//	sliceSM(&matlab);
+	// sliceGM(&matlab);
+	sliceSM(&matlab);
 	}
 
     // Montecarlo
diff --git a/Student_Cuda/src/main/mainTest.cpp b/Student_Cuda/src/main/mainTest.cpp
index 4046c5c..b8858d8 100755
--- a/Student_Cuda/src/main/mainTest.cpp
+++ b/Student_Cuda/src/main/mainTest.cpp
@@ -52,17 +52,16 @@ int mainTest()
 /**
  * activer ci-dessous la version souhaiter
  */
-void slice()
-    {
+void slice() {
     // VTSliceGMHOST test1;
-    VTSliceGM test2;
-    // VTSliceSM test3;
+    // VTSliceGM test2;
+    VTSliceSM test3; // only the SM variant is instantiated; the GMHOST and GM variants are commented out
 
 
     // test1.run();
-   test2.run();
-   // test3.run();
-    }
+    // test2.run();
+    test3.run(); // run the SM test object declared above
+}
 
 /**
  * activer ci-dessous la version souhaiter
diff --git a/Student_Cuda/src/main/mainUse.cpp b/Student_Cuda/src/main/mainUse.cpp
index e827da5..fb9ac41 100755
--- a/Student_Cuda/src/main/mainUse.cpp
+++ b/Student_Cuda/src/main/mainUse.cpp
@@ -67,12 +67,12 @@ int mainUse()
 void slice(bool& isOk)
     {
     // SliceGmHostUse sliceGmHostUse(IS_VERBOSE);
-    SliceGmUse sliceGmUse(IS_VERBOSE);
-    // SliceSmUse sliceSmUse(IS_VERBOSE);
+    // SliceGmUse sliceGmUse(IS_VERBOSE);
+    SliceSmUse sliceSmUse(IS_VERBOSE); // only the SM variant is instantiated; the GMHOST and GM variants are commented out
 
     // isOk &= sliceGmHostUse.isOk(IS_VERBOSE);
-   isOk &= sliceGmUse.isOk(IS_VERBOSE);
-//    isOk &= sliceSmUse.isOk(IS_VERBOSE);
+    // isOk &= sliceGmUse.isOk(IS_VERBOSE);
+    isOk &= sliceSmUse.isOk(IS_VERBOSE); // AND the SM verdict into the caller's flag
     }
 
 /**