Trace for matmul_check
  1. Preprocessing loop contracts
  2. Function.inline_def [cFunDef "mm"];
  3. List.iter tile [("i", 32); ("j", 32); ("k", 4)];
  4. Loop.reorder_at ~order:["bi"; "bj"; "bk"; "i"; "k"; "j"] [cPlusEq ~lhs:[cVar "sum"] ()];
  5. Loop.hoist_expr ~dest:[tBefore; cFor "bi"] "pB" ~indep:["bi"; "i"] [cArrayRead "B"];
  6. Matrix.stack_copy ~var:"sum" ~copy_var:"s" ~copy_dims:1 [cFor ~body:[cPlusEq ~lhs:[cVar "sum"] ()] "k"];
  7. Omp.simd [nbMulti; cFor ~body:[cPlusEq ~lhs:[cVar "s"] ()] "j"];
  8. Omp.parallel_for [nbMulti; cFunBody ""; cStrict; cFor ""];
  9. Loop.unroll ~simpl:Arith.do_nothing [cFor ~body:[cPlusEq ~lhs:[cVar "s"] ()] "k"];
  10. postprocessing (); )
tmp/{before67496f.cpp → afterf8ff61.cpp} RENAMED
@@ -1,33 +1,63 @@
1
  #include <optitrust.h>
2
 
3
  // NOTE: using pretty matrix notation
4
 
5
- void mm(float* C, float* A, float* B, int m, int n, int p) {
6
- __modifies("C ~> Matrix2(m, n)");
7
- __reads("A ~> Matrix2(m, p)");
8
- __reads("B ~> Matrix2(p, n)");
9
- for (int i = 0; i < m; i++) {
10
- __xmodifies("for j in 0..n -> &C[i][j] ~> Cell");
 
11
- for (int j = 0; j < n; j++) {
12
- __xmodifies("&C[i][j] ~> Cell");
13
- float sum = 0.f;
14
- for (int k = 0; k < p; k++) {
15
- const __ghost_fn focusA =
16
- __ghost_begin(matrix2_ro_focus, "M := A, i := i, j := k");
17
- const __ghost_fn focusB =
18
- __ghost_begin(matrix2_ro_focus, "M := B, i := k, j := j");
19
- sum += A[i][k] * B[k][j];
20
- __ghost_end(focusA);
21
- __ghost_end(focusB);
22
- }
23
- C[i][j] = sum;
24
  }
25
  }
26
  }
27
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
- void mm1024(float* C, float* A, float* B) {
 
 
 
 
29
- __modifies("C ~> Matrix2(1024, 1024)");
 
 
 
 
 
 
 
 
 
 
 
 
30
- __reads("A ~> Matrix2(1024, 1024)");
31
- __reads("B ~> Matrix2(1024, 1024)");
 
32
- mm(C, A, B, 1024, 1024, 1024);
 
 
 
 
 
 
 
 
33
  }
1
  #include <optitrust.h>
2
 
3
  // NOTE: using pretty matrix notation
4
 
5
+ void mm1024(float* C, float* A, float* B) {
 
 
 
 
6
+ float* const pB = (float* const)malloc(sizeof(float[32][256][4][32]));
7
+ #pragma omp parallel for
8
+ for (int bj = 0; bj < 32; bj++) {
9
+ for (int bk = 0; bk < 256; bk++) {
 
10
+ for (int k = 0; k < 4; k++) {
11
+ for (int j = 0; j < 32; j++) {
 
 
12
+ pB[32768 * bj + 128 * bk + 32 * k + j] =
13
+ B[1024 * (4 * bk + k) + 32 * bj + j];
 
 
 
 
14
  }
15
  }
16
  }
17
+ }
18
+ #pragma omp parallel for
19
+ for (int bi = 0; bi < 32; bi++) {
20
+ for (int bj = 0; bj < 32; bj++) {
21
+ float* const sum = (float* const)malloc(sizeof(float[32][32]));
22
+ for (int i = 0; i < 32; i++) {
23
+ for (int j = 0; j < 32; j++) {
24
+ sum[32 * i + j] = 0.f;
25
+ }
26
+ }
27
+ for (int bk = 0; bk < 256; bk++) {
28
+ for (int i = 0; i < 32; i++) {
29
+ float* const s = ref[32] float();
30
+ MMEMCPY(s, 0, sum, 32 * i, 32, sizeof(float));
31
+ #pragma omp simd
32
+ for (int j = 0; j < 32; j++) {
33
+ s[j] += A[1024 * (32 * bi + i) + 4 * bk] *
34
+ pB[32768 * bj + 128 * bk + j];
35
+ }
36
+ #pragma omp simd
37
+ for (int j = 0; j < 32; j++) {
38
+ s[j] += A[1 + 1024 * (32 * bi + i) + 4 * bk] *
39
+ pB[32 + 32768 * bj + 128 * bk + j];
40
+ }
41
+ #pragma omp simd
42
+ for (int j = 0; j < 32; j++) {
43
+ s[j] += A[2 + 1024 * (32 * bi + i) + 4 * bk] *
44
+ pB[64 + 32768 * bj + 128 * bk + j];
45
+ }
46
+ #pragma omp simd
47
+ for (int j = 0; j < 32; j++) {
48
+ s[j] += A[3 + 1024 * (32 * bi + i) + 4 * bk] *
49
+ pB[96 + 32768 * bj + 128 * bk + j];
50
+ }
51
+ MMEMCPY(sum, 32 * i, s, 0, 32, sizeof(float));
52
+ }
53
+ }
54
+ for (int i = 0; i < 32; i++) {
55
+ for (int j = 0; j < 32; j++) {
56
+ C[1024 * (32 * bi + i) + 32 * bj + j] = sum[32 * i + j];
57
+ }
58
+ }
59
+ free(sum);
60
+ }
61
+ }
62
+ free(pB);
63
  }