Skip to content

Commit

Permalink
Strange behavior on CI: every few gradient-check runs fail intermittently.
Browse files Browse the repository at this point in the history
  • Loading branch information
szaman19 committed Jun 9, 2024
1 parent 7861255 commit f279914
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# Data
np.random.seed(20200115)
_num_samples = 15
_sample_dims = (15,36,1)
_sample_dims = (15,5,1)
_sample_size = functools.reduce(operator.mul, _sample_dims)
_samples = np.random.normal(loc=0.5, size=(_num_samples,_sample_size)).astype(np.float32)

Expand Down Expand Up @@ -103,11 +103,12 @@ def construct_model(lbann):
x = x_lbann

y = lbann.ChannelwiseSoftmax(x,
data_layout='data_parallel',
parallel_strategy=create_parallel_strategy(num_channel_groups),
name="Channelwise_softmax_distconv")
z = lbann.L2Norm2(y)
obj.append(z)
metrics.append(lbann.Metric(z, name='data-parallel layout'))
metrics.append(lbann.Metric(z, name='channelwise split distconv'))

# NumPy implementation
vals = []
Expand Down
3 changes: 1 addition & 2 deletions include/lbann/layers/misc/channelwise_softmax.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,7 @@ void channelwise_softmax_layer<TensorDataType,Layout,Device>::setup_dims(DataRea
output_dims.size(),"-D output tensor");
}
}

#endif
#endif // LBANN_HAS_DISTCONV
}

#ifdef LBANN_HAS_DISTCONV
Expand Down
12 changes: 6 additions & 6 deletions src/layers/misc/distconv/distconv_channelwise_softmax.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,17 @@ namespace distconv{
template<typename Allocator>
int
ChannelwiseSoftmax<Backend, DataType>
::backward(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_0,
::backward(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output,
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_grad_0){
if (input_0.get_local_size() == 0 ||
if (output.get_local_size() == 0 ||
output_grad.get_local_size() == 0 ||
input_grad_0.get_local_size() == 0){
util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
return 1; // no op for empty inputs
}

const auto& input_0_dims = input_0.get_local_shape();
const auto& input_0_dims = output.get_local_shape();
const auto num_channels = input_0_dims[2];
const auto local_mini_batch_size = input_0_dims[3];
const auto mat_channel_size = input_0_dims[0] * input_0_dims[1];
Expand All @@ -98,9 +98,9 @@ namespace distconv{

using LocalMat = El::Matrix<DataType, El::Device::GPU>;

LocalMat local_input(mat_stride,
LocalMat local_output(mat_stride,
local_mini_batch_size,
input_0.get_buffer(),
output.get_buffer(),
mat_stride);

LocalMat local_output_grad(mat_stride,
Expand All @@ -115,7 +115,7 @@ namespace distconv{

::lbann::channelwise_softmax_bp_impl(num_channels,
mat_channel_size,
local_input,
local_output,
local_output_grad,
local_input_grad);
return 1;
Expand Down

0 comments on commit f279914

Please sign in to comment.