Trace for harris
  1. Variable.inline ~simpl [ctx; multi cVarDef ["h1"; "w1"; "h2"; "w2"]];
  2. List.iter fuse [ ["grayscale"], [], ["gray"]; ["sobelX"; "sobelY"], [], ["ix"; "iy"]; ["mul"; "sum3x3"; "coarsity"], overlaps_2x2 ["ixx"; "ixy"; "iyy"], ["out"]; ];
  3. Stencil.fusion_targets_tile [int 32] ~outputs:["out"] ~overlaps:["gray", [int 4]; "ix", [int 2]] [ctx; nbMulti; cFor "y"];
  4. simpl_mins [ctx];
  5. Matrix.storage_folding ~dim:0 ~size:(int 4) [ctx; multi cVarDef ["gray"; "ix"; "iy"]];
  6. Matrix.elim [ctx; multi cVarDef ["ixx"; "ixy"; "iyy"]];
  7. List.iter inline ["weights_sobelX"; "weights_sobelY"; "weights_sum3x3"];
  8. List.iter bind_gradient ["ix"; "iy"];
  9. Matrix.elim_mops [ctx];
  10. Omp.parallel_for [ctx; cFor "y"];
  11. Omp.simd ~clause:[Simdlen 8] [ctx; nbMulti; cFor "x"]; )
tmp/{beforeda6fcb.cpp → aftercd66f8.cpp} RENAMED
@@ -67,43 +67,107 @@ void coarsity(float* out, int h, int w, const float* sxx, const float* sxy,
67
  for (int y = 0; y < h; y++) {
68
  for (int x = 0; x < w; x++) {
69
  float det = sxx[y][x] * syy[y][x] - sxy[y][x] * sxy[y][x];
70
  float trace = sxx[y][x] + syy[y][x];
71
  out[y][x] = det - kappa * trace * trace;
72
  }
73
  }
74
  }
75
 
76
  void harris(float* out, int h, int w, const float* in) {
77
- const int h1 = h - 2;
78
- const int w1 = w - 2;
79
- const int h2 = h1 - 2;
80
- const int w2 = w1 - 2;
 
81
- float* const gray = (float* const)malloc(sizeof(float[h][w]));
82
- float* const ix = (float* const)malloc(sizeof(float[h1][w1]));
83
- float* const iy = (float* const)malloc(sizeof(float[h1][w1]));
84
- float* const ixx = (float* const)malloc(sizeof(float[h1][w1]));
 
85
- float* const ixy = (float* const)malloc(sizeof(float[h1][w1]));
86
- float* const iyy = (float* const)malloc(sizeof(float[h1][w1]));
87
- float* const sxx = (float* const)malloc(sizeof(float[h2][w2]));
88
- float* const sxy = (float* const)malloc(sizeof(float[h2][w2]));
89
- float* const syy = (float* const)malloc(sizeof(float[h2][w2]));
90
- grayscale(gray, h, w, in);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
- sobelX(ix, h, w, gray);
92
- sobelY(iy, h, w, gray);
93
- mul(ixx, h1, w1, ix, ix);
 
 
 
 
 
 
 
 
 
94
- mul(ixy, h1, w1, ix, iy);
 
 
 
 
 
 
 
 
 
95
- mul(iyy, h1, w1, iy, iy);
96
- sum3x3(sxx, h1, w1, ixx);
97
- sum3x3(sxy, h1, w1, ixy);
98
- sum3x3(syy, h1, w1, iyy);
 
 
 
 
 
 
 
99
- coarsity(out, h2, w2, sxx, sxy, syy, 0.0399999991059f);
 
 
 
 
 
 
100
  free(gray);
101
  free(ix);
102
  free(iy);
103
- free(ixx);
104
- free(ixy);
105
- free(iyy);
106
- free(sxx);
107
- free(sxy);
108
- free(syy);
109
  }
67
  for (int y = 0; y < h; y++) {
68
  for (int x = 0; x < w; x++) {
69
  float det = sxx[y][x] * syy[y][x] - sxy[y][x] * sxy[y][x];
70
  float trace = sxx[y][x] + syy[y][x];
71
  out[y][x] = det - kappa * trace * trace;
72
  }
73
  }
74
  }
75
 
76
  void harris(float* out, int h, int w, const float* in) {
77
+ #pragma omp parallel for
 
78
+ for (int y = 0; y < -4 + h; y += 32) {
79
+ float* const iy = (float* const)malloc(sizeof(float[4][-2 + w]));
80
+ float* const ix = (float* const)malloc(sizeof(float[4][-2 + w]));
81
+ float* const gray = (float* const)malloc(sizeof(float[4][w]));
 
 
82
+ for (int y_out = 0; y_out < min(h, 36 + y) - y; y_out++) {
83
+ #pragma omp simd simdlen(8)
84
+ for (int x = 0; x < w; x++) {
 
 
 
 
85
+ gray[y_out % 4 * w + x] =
86
+ 0.298999994993f * in[(y_out + y) * w + x] +
87
+ 0.587000012398f * in[h * w + (y_out + y) * w + x] +
88
+ 0.11400000006f * in[2 * h * w + (y_out + y) * w + x];
89
+ }
90
+ if (2 <= y_out) {
91
+ #pragma omp simd simdlen(8)
92
+ for (int x = 0; x < -2 + w; x++) {
93
+ float acc_ix = 0.f;
94
+ acc_ix += -0.0833333333333f * gray[(-2 + y_out) % 4 * w + x];
95
+ acc_ix += 0.0833333333333f * gray[2 + (-2 + y_out) % 4 * w + x];
96
+ acc_ix += -0.166666666667f * gray[(-1 + y_out) % 4 * w + x];
97
+ acc_ix += 0.166666666667f * gray[2 + (-1 + y_out) % 4 * w + x];
98
+ acc_ix += -0.0833333333333f * gray[y_out % 4 * w + x];
99
+ acc_ix += 0.0833333333333f * gray[2 + y_out % 4 * w + x];
100
+ ix[(-2 + y_out) % 4 * (-2 + w) + x] = acc_ix;
101
+ float acc_iy = 0.f;
102
+ acc_iy += -0.0833333333333f * gray[(-2 + y_out) % 4 * w + x];
103
+ acc_iy += -0.166666666667f * gray[1 + (-2 + y_out) % 4 * w + x];
104
+ acc_iy += -0.0833333333333f * gray[2 + (-2 + y_out) % 4 * w + x];
105
+ acc_iy += 0.0833333333333f * gray[y_out % 4 * w + x];
106
+ acc_iy += 0.166666666667f * gray[1 + y_out % 4 * w + x];
107
+ acc_iy += 0.0833333333333f * gray[2 + y_out % 4 * w + x];
108
+ iy[(-2 + y_out) % 4 * (-2 + w) + x] = acc_iy;
109
+ }
110
+ }
111
+ if (4 <= y_out) {
112
+ #pragma omp simd simdlen(8)
113
+ for (int x = 0; x < -4 + w; x++) {
114
+ float ix0 = ix[(-4 + y_out) % 4 * (-2 + w) + x];
115
+ float ix1 = ix[1 + (-4 + y_out) % 4 * (-2 + w) + x];
116
+ float ix2 = ix[2 + (-4 + y_out) % 4 * (-2 + w) + x];
117
+ float ix3 = ix[(-3 + y_out) % 4 * (-2 + w) + x];
118
+ float ix4 = ix[1 + (-3 + y_out) % 4 * (-2 + w) + x];
119
+ float ix5 = ix[2 + (-3 + y_out) % 4 * (-2 + w) + x];
120
+ float ix6 = ix[(-2 + y_out) % 4 * (-2 + w) + x];
121
+ float ix7 = ix[1 + (-2 + y_out) % 4 * (-2 + w) + x];
122
+ float ix8 = ix[2 + (-2 + y_out) % 4 * (-2 + w) + x];
123
+ float iy0 = iy[(-4 + y_out) % 4 * (-2 + w) + x];
124
+ float iy1 = iy[1 + (-4 + y_out) % 4 * (-2 + w) + x];
125
+ float iy2 = iy[2 + (-4 + y_out) % 4 * (-2 + w) + x];
126
+ float iy3 = iy[(-3 + y_out) % 4 * (-2 + w) + x];
127
+ float iy4 = iy[1 + (-3 + y_out) % 4 * (-2 + w) + x];
128
+ float iy5 = iy[2 + (-3 + y_out) % 4 * (-2 + w) + x];
129
+ float iy6 = iy[(-2 + y_out) % 4 * (-2 + w) + x];
130
+ float iy7 = iy[1 + (-2 + y_out) % 4 * (-2 + w) + x];
131
+ float iy8 = iy[2 + (-2 + y_out) % 4 * (-2 + w) + x];
132
+ float acc_sxx = 0.f;
 
133
+ acc_sxx += ix0 * ix0;
134
+ acc_sxx += ix1 * ix1;
135
+ acc_sxx += ix2 * ix2;
136
+ acc_sxx += ix3 * ix3;
137
+ acc_sxx += ix4 * ix4;
138
+ acc_sxx += ix5 * ix5;
139
+ acc_sxx += ix6 * ix6;
140
+ acc_sxx += ix7 * ix7;
141
+ acc_sxx += ix8 * ix8;
142
+ float acc_sxy = 0.f;
143
+ acc_sxy += ix0 * iy0;
144
+ acc_sxy += ix1 * iy1;
145
+ acc_sxy += ix2 * iy2;
146
+ acc_sxy += ix3 * iy3;
147
+ acc_sxy += ix4 * iy4;
148
+ acc_sxy += ix5 * iy5;
149
+ acc_sxy += ix6 * iy6;
150
+ acc_sxy += ix7 * iy7;
151
+ acc_sxy += ix8 * iy8;
152
+ float acc_syy = 0.f;
153
+ acc_syy += iy0 * iy0;
 
 
154
+ acc_syy += iy1 * iy1;
155
+ acc_syy += iy2 * iy2;
156
+ acc_syy += iy3 * iy3;
157
+ acc_syy += iy4 * iy4;
158
+ acc_syy += iy5 * iy5;
159
+ acc_syy += iy6 * iy6;
160
+ acc_syy += iy7 * iy7;
161
+ acc_syy += iy8 * iy8;
162
+ float det_out = acc_sxx * acc_syy - acc_sxy * acc_sxy;
163
+ float trace_out = acc_sxx + acc_syy;
164
+ out[(-4 + y_out + y) * (-4 + w) + x] =
165
+ det_out - 0.0399999991059f * trace_out * trace_out;
166
+ }
167
+ }
168
+ }
169
  free(gray);
170
  free(ix);
171
  free(iy);
172
+ }
 
 
 
 
 
173
  }