Skip to content

Commit

Permalink
Force the LRN Vulkan accumulator to always use fp32 (fixes #2882)
Browse files — browse the repository at this point in the history
  • Loading branch information
nihui committed May 6, 2021
1 parent 7a1e015 commit d0d8120
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 53 deletions.
20 changes: 10 additions & 10 deletions src/layer/vulkan/shader/lrn_norm.comp
Original file line number Diff line number Diff line change
Expand Up @@ -77,37 +77,37 @@ void main()

if (region_type == 0)
{
afp sum = afp(0.f);
float sum = 0.f;

#if NCNN_image_shader
for (int z = 0; z < local_size; z++)
{
sum += afp(texelFetch(square_workspace, ivec3(gx, gy, gz + z), 0).r);
sum += float(texelFetch(square_workspace, ivec3(gx, gy, gz + z), 0).r);
}
#else
int v_offset = gz * psc(cstep) + gy * psc(w) + gx;

for (int z = 0; z < local_size; z++)
{
sum += afp(square_workspace_data[v_offset]);
sum += float(square_workspace_data[v_offset]);

v_offset += psc(cstep);
}
#endif

const afp alpha_div_size = afp(alpha / local_size);
scale = pow(afp(bias_constant) + alpha_div_size * sum, afp(-beta));
const float alpha_div_size = alpha / local_size;
scale = afp(pow(bias_constant + alpha_div_size * sum, -beta));
}
else
{
afp sum = afp(0.f);
float sum = 0.f;

#if NCNN_image_shader
for (int y = 0; y < local_size; y++)
{
for (int x = 0; x < local_size; x++)
{
sum += afp(texelFetch(square_workspace, ivec3(gx + x, gy + y, gz), 0).r);
sum += float(texelFetch(square_workspace, ivec3(gx + x, gy + y, gz), 0).r);
}
}
#else
Expand All @@ -117,15 +117,15 @@ void main()
{
for (int x = 0; x < local_size; x++)
{
sum += afp(square_workspace_data[v_offset + x]);
sum += float(square_workspace_data[v_offset + x]);
}

v_offset += psc(w);
}
#endif

const afp alpha_div_size = afp(alpha / (local_size * local_size));
scale = pow(afp(bias_constant) + alpha_div_size * sum, afp(-beta));
const float alpha_div_size = alpha / (local_size * local_size);
scale = afp(pow(bias_constant + alpha_div_size * sum, -beta));
}

#if NCNN_image_shader
Expand Down
22 changes: 11 additions & 11 deletions src/layer/vulkan/shader/lrn_norm_across_channel_pack4.comp
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,17 @@ void main()

// support region_type == 0 only

afpvec4 sum = afpvec4(0.f);
vec4 sum = vec4(0.f);

#if NCNN_image_shader
ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3);

for (int z = 0; z < local_size; z++)
{
sum.r += afp(texelFetch(square_workspace, ivec3(gx, gy, z4.r), 0).r);
sum.g += afp(texelFetch(square_workspace, ivec3(gx, gy, z4.g), 0).r);
sum.b += afp(texelFetch(square_workspace, ivec3(gx, gy, z4.b), 0).r);
sum.a += afp(texelFetch(square_workspace, ivec3(gx, gy, z4.a), 0).r);
sum.r += texelFetch(square_workspace, ivec3(gx, gy, z4.r), 0).r;
sum.g += texelFetch(square_workspace, ivec3(gx, gy, z4.g), 0).r;
sum.b += texelFetch(square_workspace, ivec3(gx, gy, z4.b), 0).r;
sum.a += texelFetch(square_workspace, ivec3(gx, gy, z4.a), 0).r;

z4 += 1;
}
Expand All @@ -95,17 +95,17 @@ void main()

for (int z = 0; z < local_size; z++)
{
sum.r += afp(square_workspace_data[v_offset.r]);
sum.g += afp(square_workspace_data[v_offset.g]);
sum.b += afp(square_workspace_data[v_offset.b]);
sum.a += afp(square_workspace_data[v_offset.a]);
sum.r += square_workspace_data[v_offset.r];
sum.g += square_workspace_data[v_offset.g];
sum.b += square_workspace_data[v_offset.b];
sum.a += square_workspace_data[v_offset.a];

v_offset += psc(cstep);
}
#endif

const afp alpha_div_size = afp(alpha / local_size);
afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta));
const float alpha_div_size = alpha / local_size;
afpvec4 scale = afpvec4(pow(bias_constant + alpha_div_size * sum, vec4(-beta)));

#if NCNN_image_shader
afpvec4 v = image3d_ld4(bottom_blob, ivec3(gx, gy, gz));
Expand Down
40 changes: 20 additions & 20 deletions src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
Original file line number Diff line number Diff line change
Expand Up @@ -76,22 +76,22 @@ void main()

// support region_type == 0 only

afpvec8 sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
mat2x4 sum = mat2x4(0.f);

#if NCNN_image_shader
ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3);
ivec4 zz4 = z4 + 4;

for (int z = 0; z < local_size; z++)
{
sum[0].r += afp(texelFetch(square_workspace, ivec3(gx, gy, z4.r), 0).r);
sum[0].g += afp(texelFetch(square_workspace, ivec3(gx, gy, z4.g), 0).r);
sum[0].b += afp(texelFetch(square_workspace, ivec3(gx, gy, z4.b), 0).r);
sum[0].a += afp(texelFetch(square_workspace, ivec3(gx, gy, z4.a), 0).r);
sum[1].r += afp(texelFetch(square_workspace, ivec3(gx, gy, zz4.r), 0).r);
sum[1].g += afp(texelFetch(square_workspace, ivec3(gx, gy, zz4.g), 0).r);
sum[1].b += afp(texelFetch(square_workspace, ivec3(gx, gy, zz4.b), 0).r);
sum[1].a += afp(texelFetch(square_workspace, ivec3(gx, gy, zz4.a), 0).r);
sum[0].r += texelFetch(square_workspace, ivec3(gx, gy, z4.r), 0).r;
sum[0].g += texelFetch(square_workspace, ivec3(gx, gy, z4.g), 0).r;
sum[0].b += texelFetch(square_workspace, ivec3(gx, gy, z4.b), 0).r;
sum[0].a += texelFetch(square_workspace, ivec3(gx, gy, z4.a), 0).r;
sum[1].r += texelFetch(square_workspace, ivec3(gx, gy, zz4.r), 0).r;
sum[1].g += texelFetch(square_workspace, ivec3(gx, gy, zz4.g), 0).r;
sum[1].b += texelFetch(square_workspace, ivec3(gx, gy, zz4.b), 0).r;
sum[1].a += texelFetch(square_workspace, ivec3(gx, gy, zz4.a), 0).r;

z4 += 1;
zz4 += 1;
Expand All @@ -104,24 +104,24 @@ void main()

for (int z = 0; z < local_size; z++)
{
sum[0].r += afp(square_workspace_data[v_offset.r]);
sum[0].g += afp(square_workspace_data[v_offset.g]);
sum[0].b += afp(square_workspace_data[v_offset.b]);
sum[0].a += afp(square_workspace_data[v_offset.a]);
sum[1].r += afp(square_workspace_data[vv_offset.r]);
sum[1].g += afp(square_workspace_data[vv_offset.g]);
sum[1].b += afp(square_workspace_data[vv_offset.b]);
sum[1].a += afp(square_workspace_data[vv_offset.a]);
sum[0].r += square_workspace_data[v_offset.r];
sum[0].g += square_workspace_data[v_offset.g];
sum[0].b += square_workspace_data[v_offset.b];
sum[0].a += square_workspace_data[v_offset.a];
sum[1].r += square_workspace_data[vv_offset.r];
sum[1].g += square_workspace_data[vv_offset.g];
sum[1].b += square_workspace_data[vv_offset.b];
sum[1].a += square_workspace_data[vv_offset.a];

v_offset += psc(cstep);
vv_offset += psc(cstep);
}
#endif

const afp alpha_div_size = afp(alpha / local_size);
const float alpha_div_size = alpha / local_size;
afpvec8 scale;
scale[0] = pow(afp(bias_constant) + alpha_div_size * sum[0], afpvec4(-beta));
scale[1] = pow(afp(bias_constant) + alpha_div_size * sum[1], afpvec4(-beta));
scale[0] = afpvec4(pow(bias_constant + alpha_div_size * sum[0], vec4(-beta)));
scale[1] = afpvec4(pow(bias_constant + alpha_div_size * sum[1], vec4(-beta)));

#if NCNN_image_shader
afpvec8 v = image3d_ld8(bottom_blob, ivec3(gx, gy, gz));
Expand Down
10 changes: 5 additions & 5 deletions src/layer/vulkan/shader/lrn_norm_within_channel_pack4.comp
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,14 @@ void main()

// support region_type == 1 only

afpvec4 sum = afpvec4(0.f);
vec4 sum = vec4(0.f);

#if NCNN_image_shader
for (int y = 0; y < local_size; y++)
{
for (int x = 0; x < local_size; x++)
{
sum += afpvec4(texelFetch(square_workspace, ivec3(gx + x, gy + y, gz), 0));
sum += texelFetch(square_workspace, ivec3(gx + x, gy + y, gz), 0);
}
}
#else
Expand All @@ -92,15 +92,15 @@ void main()
{
for (int x = 0; x < local_size; x++)
{
sum += afpvec4(square_workspace_data[v_offset + x]);
sum += square_workspace_data[v_offset + x];
}

v_offset += psc(w);
}
#endif

const afp alpha_div_size = afp(alpha / (local_size * local_size));
afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta));
const float alpha_div_size = alpha / (local_size * local_size);
afpvec4 scale = afpvec4(pow(bias_constant + alpha_div_size * sum, vec4(-beta)));

#if NCNN_image_shader
afpvec4 v = image3d_ld4(bottom_blob, ivec3(gx, gy, gz));
Expand Down
14 changes: 7 additions & 7 deletions src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,15 @@ void main()

// support region_type == 1 only

afpvec8 sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
mat2x4 sum = mat2x4(0.f);

#if NCNN_image_shader
for (int y = 0; y < local_size; y++)
{
for (int x = 0; x < local_size; x++)
{
sum[0] += afpvec4(texelFetch(square_workspace, ivec3((gx + x) * 2, gy + y, gz), 0));
sum[1] += afpvec4(texelFetch(square_workspace, ivec3((gx + x) * 2 + 1, gy + y, gz), 0));
sum[0] += texelFetch(square_workspace, ivec3((gx + x) * 2, gy + y, gz), 0);
sum[1] += texelFetch(square_workspace, ivec3((gx + x) * 2 + 1, gy + y, gz), 0);
}
}
#else
Expand All @@ -94,17 +94,17 @@ void main()
{
for (int x = 0; x < local_size; x++)
{
sum += afpvec8(square_workspace_data[v_offset + x]);
sum += square_workspace_data[v_offset + x];
}

v_offset += psc(w);
}
#endif

const afp alpha_div_size = afp(alpha / (local_size * local_size));
const float alpha_div_size = alpha / (local_size * local_size);
afpvec8 scale;
scale[0] = pow(afp(bias_constant) + alpha_div_size * sum[0], afpvec4(-beta));
scale[1] = pow(afp(bias_constant) + alpha_div_size * sum[1], afpvec4(-beta));
scale[0] = afpvec4(pow(bias_constant + alpha_div_size * sum[0], vec4(-beta)));
scale[1] = afpvec4(pow(bias_constant + alpha_div_size * sum[1], vec4(-beta)));

#if NCNN_image_shader
afpvec8 v = image3d_ld8(bottom_blob, ivec3(gx, gy, gz));
Expand Down

0 comments on commit d0d8120

Please sign in to comment.