From 6c5c8750b3fe3ec4ca0d26055ceda33305e13d6d Mon Sep 17 00:00:00 2001 From: fastium Date: Thu, 28 May 2026 15:37:12 +0200 Subject: [PATCH] feat(lab05): add ex 1 with report --- .gitignore | 2 + doc/lab05-optimization/main.typ | 184 +++++++++++++++++++++++++++ src/05-optimization/ex01/basic.c | 23 ++++ src/05-optimization/ex01/optimized.c | 22 ++++ 4 files changed, 231 insertions(+) create mode 100644 doc/lab05-optimization/main.typ create mode 100644 src/05-optimization/ex01/basic.c create mode 100644 src/05-optimization/ex01/optimized.c diff --git a/.gitignore b/.gitignore index a336197..bd01306 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,5 @@ build src/03-led-controller/led-controller src/04-multiprocessing/multiprocessing src/04-multiprocessing/cgroups +src/05-optimization/ex01/basic +src/05-optimization/ex01/optimized diff --git a/doc/lab05-optimization/main.typ b/doc/lab05-optimization/main.typ new file mode 100644 index 0000000..220bc64 --- /dev/null +++ b/doc/lab05-optimization/main.typ @@ -0,0 +1,184 @@ +#import "/doc/metadata.typ": * + += Optimization + +In this laboratory, we experiment with the `perf` tool. + + + +``` +Performance counter stats for './ex1': + + 40609.10 msec task-clock # 1.000 CPUs utilized + 22 context-switches # 0.542 /sec + 0 cpu-migrations # 0.000 /sec + 48867 page-faults # 1.203 K/sec + 33136692484 cycles # 0.816 GHz + 1671194529 instructions # 0.05 insn per cycle + 269592231 branches # 6.639 M/sec + 1013366 branch-misses # 0.38% of all branches + + 40.618926728 seconds time elapsed + + 39.901620000 seconds user + 0.296158000 seconds sys + +``` +This program performed 22 context switches and ran for 40.6 s of elapsed time. 
+ +#task([ +Measure the performance of ex1 +],[ +``` +Performance counter stats for './ex1': + + 40609.10 msec task-clock # 1.000 CPUs utilized + 22 context-switches # 0.542 /sec + 0 cpu-migrations # 0.000 /sec + 48867 page-faults # 1.203 K/sec + 33136692484 cycles # 0.816 GHz + 1671194529 instructions # 0.05 insn per cycle + 269592231 branches # 6.639 M/sec + 1013366 branch-misses # 0.38% of all branches + + 40.618926728 seconds time elapsed + + 39.901620000 seconds user + 0.296158000 seconds sys + +``` +This program performed 22 context switches and ran for 40.6 s of elapsed time. +]) + +#task([ +What is the error in the ex1 program? +],[ +The program uses 2 nested loops to go through the array. However, another loop encapsulates these two. As a result, the whole array is traversed 10 times just to perform an addition. That is the problem. It can be solved by removing the outer loop and adding 10 directly: + +```c + int i, j; + for (i = 0; i < SIZE; i++) + { + for (j = 0; j < SIZE; j++) + { + array[j][i]+= 10; + } + } +``` + +With these modifications, the performance should improve by a factor of about 10. + +``` + Performance counter stats for './optimized': + + 4759.07 msec task-clock # 0.998 CPUs utilized + 20 context-switches # 4.203 /sec + 0 cpu-migrations # 0.000 /sec + 48866 page-faults # 10.268 K/sec + 3883198165 cycles # 0.816 GHz + 282691820 instructions # 0.07 insn per cycle + 40234737 branches # 8.454 M/sec + 653642 branch-misses # 1.62% of all branches + + 4.768030627 seconds time elapsed + + 4.385881000 seconds user + 0.320226000 seconds sys +``` + +This can be observed by running the same measurement as before with `perf`: the elapsed time was around 40 s before and is now about 4.7 s. 
 The same observation can be made for the cache misses: +- optimized : 42103472 +- basic : 406627550 + + +]) + + +#task([ + Show the L1 cache misses for ex1: +],[ + #table( + columns: (1.5fr, 1fr), + stroke: none, + [ + Not optimized + ``` +407036282 L1-dcache-load-misses + +39.868545227 seconds time elapsed +39.115950000 seconds user +0.347522000 seconds sys + + ``` + ],[ + Optimized + ``` +42027157 L1-dcache-load-misses + +4.132272210 seconds time elapsed +3.778635000 seconds user +0.296472000 seconds sys + ``` + ] + ) + There is still a factor of 10, as before, between the L1 cache misses. +]) + + +#task([Events analysed with `perf`:],[ + +- *Instructions*: It indicates the number of CPU instructions executed while the program is running. +- *Cache-misses*: This happens when the requested data is not currently stored in the cache. The request is then passed to the next memory level: RAM. +- *Branch-misses*: It happens on a conditional branch, when the CPU tries to predict the next instruction and mispredicts. +- *L1-dcache-load-misses*: It happens when the requested data is not stored in the L1 data cache. The request is then passed to the next memory level, here the L2 cache. +- *Cpu-migrations*: It indicates the number of times the program has moved from one CPU (hardware thread) to another. +- *Context-switches*: The program shares the CPU with other tasks. Sometimes it yields the CPU core to another task. This involves a context switch: some registers, such as the PC, have to be changed. 
+ +]) + + +#task([Timing performance of `perf`], [ + Here are some executions of the optimized program: + + #figure(table( + columns: (1fr, 1fr), + // stroke: none, + [*Without `perf`*], [*With `perf`*], + [ + ``` +real 0m 4.44s +user 0m 3.83s +sys 0m 0.29s + + ``` + ], + [ + ``` +real 0m 4.38s +user 0m 4.05s +sys 0m 0.27s + + ``` + ],[ + ``` +real 0m 4.75s +user 0m 4.09s +sys 0m 0.34s + + ``` + ],[ + ``` +real 0m 4.75s +user 0m 4.09s +sys 0m 0.34s + + ``` + ], + ), + caption:[Impact of the tool `perf`] + ) <impact-perf> + + As shown in @impact-perf, the tool does not significantly affect the program's execution time. This is most likely due to the CPU allocation. + +]) + diff --git a/src/05-optimization/ex01/basic.c b/src/05-optimization/ex01/basic.c new file mode 100644 index 0000000..646c561 --- /dev/null +++ b/src/05-optimization/ex01/basic.c @@ -0,0 +1,23 @@ +#include <stdint.h> + +#define SIZE 5000 + +static int32_t array[SIZE][SIZE]; + +int main (void) +{ + int i, j, k; + + for (k = 0; k < 10; k++) + { + for (i = 0; i < SIZE; i++) + { + for (j = 0; j < SIZE; j++) + { + array[j][i]++; + } + } + } + return 0; +} + diff --git a/src/05-optimization/ex01/optimized.c b/src/05-optimization/ex01/optimized.c new file mode 100644 index 0000000..de09440 --- /dev/null +++ b/src/05-optimization/ex01/optimized.c @@ -0,0 +1,22 @@ +#include <stdint.h> + +#define SIZE 5000 + +static int32_t array[SIZE][SIZE]; + +int main (void) +{ + int i, j; + + + for (i = 0; i < SIZE; i++) + { + for (j = 0; j < SIZE; j++) + { + array[j][i]+= 10; + } + } + + return 0; +} +