Skip to content

Commit

Permalink
[tests] Unit tests for convolution solvers from #1911, part 2 (#3318)
Browse files Browse the repository at this point in the history
  • Loading branch information
averinevg authored Oct 17, 2024
1 parent eecfb26 commit 38258d5
Show file tree
Hide file tree
Showing 35 changed files with 1,206 additions and 478 deletions.
153 changes: 83 additions & 70 deletions src/include/miopen/conv/solvers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ namespace debug {
MIOPEN_EXPORT extern bool
AlwaysEnableConvDirectNaive; // NOLINT (cppcoreguidelines-avoid-non-const-global-variables)

/// WORKAROUND_SWDEV_271887 disables ConvOclDirectFwd1x1 solver on gfx10 due to precision issues.
/// However we still want to check that the solver is not broken and therefore use
/// disable_wa_swdev_271887 = true to enable it.
// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
MIOPEN_INTERNALS_EXPORT extern bool disable_wa_swdev_271887;

} // namespace debug

struct AnyInvokeParams;
Expand All @@ -62,16 +68,24 @@ const int wave_size = 64;

namespace conv {

/// Base class for convolution tunable and non-tunable solvers
using ConvSolverBase = SolverMixin<ExecutionContext, miopen::conv::ProblemDescription>;
/// Common interface for convolution tunable and non-tunable solvers
using ConvSolverInterface = SolverInterface<ExecutionContext, miopen::conv::ProblemDescription>;

/// Common interface for convolution non-tunable solvers
using ConvSolverInterfaceNonTunable =
SolverInterfaceNonTunable<ExecutionContext, miopen::conv::ProblemDescription>;

/// Common interface for convolution tunable solvers
using ConvSolverInterfaceTunable =
SolverInterfaceTunable<ExecutionContext, miopen::conv::ProblemDescription>;

/// Typedef for convolution non-tunable solvers
using ConvSolver = NonTunableSolverBase<ExecutionContext, miopen::conv::ProblemDescription>;
using ConvSolver = SolverBaseNonTunable<ExecutionContext, miopen::conv::ProblemDescription>;

/// Typedef for convolution tunable solvers
template <class PerformanceConfig>
using ConvTunableSolver =
TunableSolverMixin<ExecutionContext, miopen::conv::ProblemDescription, PerformanceConfig>;
SolverBaseTunable<ExecutionContext, miopen::conv::ProblemDescription, PerformanceConfig>;

struct PerformanceConfigConvAsm3x3U : PerfConfigBase<PerformanceConfigConvAsm3x3U>
{
Expand Down Expand Up @@ -159,7 +173,6 @@ struct PerformanceConfigConvAsm1x1U : PerfConfigBase<PerformanceConfigConvAsm1x1
f(self.waves_k_in_group, "waves_k_in_group");
}

// clang-format off
int GetReadSize() const { return read_size; }
int GetKMult() const { return k_mult; }
int GetChunksPerWave() const { return chunks_per_wave; }
Expand All @@ -168,8 +181,11 @@ struct PerformanceConfigConvAsm1x1U : PerfConfigBase<PerformanceConfigConvAsm1x1
int GetCMult() const { return c_mult; }
int GetWavesCInGroup() const { return waves_c_in_group; }
int GetWavesKInGroup() const { return waves_k_in_group; }
int GetNPerGpr() const { assert(chunk_size); return 64 / chunk_size; }
// clang-format on
int GetNPerGpr() const
{
assert(chunk_size);
return 64 / chunk_size;
}

MIOPEN_INTERNALS_EXPORT void StaticHeuristic(const miopen::conv::ProblemDescription& problem);
MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&,
Expand Down Expand Up @@ -272,7 +288,6 @@ struct PerformanceConfigConvAsm1x1UV2 : PerfConfigBase<PerformanceConfigConvAsm1
f(self.waves_c_in_group, "waves_c_in_group");
}

// clang-format off
int GetChunkSize() const { return chunk_size; }
int GetDwordsPerLd() const { return dwords_per_ld; }
int GetCMult() const { return c_mult; }
Expand All @@ -283,8 +298,11 @@ struct PerformanceConfigConvAsm1x1UV2 : PerfConfigBase<PerformanceConfigConvAsm1
int GetHPerChunk() const { return h_per_chunk; }
int GetWavesCInGroup() const { return waves_c_in_group; }
int GetWavesKInGroup() const { return waves_k_in_group; }
int GetNPerGpr() const { assert(chunk_size); return 64 / chunk_size; }
// clang-format on
int GetNPerGpr() const
{
assert(chunk_size);
return 64 / chunk_size;
}

MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&);
MIOPEN_INTERNALS_EXPORT bool IsValidValue() const;
Expand Down Expand Up @@ -1739,12 +1757,12 @@ struct ConvAsmImplicitGemmGTCDynamicBwdXdlops final : ConvSolver
/// "legacy exhaustive search" machinery.
struct ConvOclDirectFwdLegacyExhaustiveSearch : ConvTunableSolver<LegacyPerformanceConfig>
{
MIOPEN_INTERNALS_EXPORT LegacyPerformanceConfig GetDefaultPerformanceConfig(
const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
MIOPEN_INTERNALS_EXPORT LegacyPerformanceConfig
Search(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const AnyInvokeParams& invoke_ctx) const override;
LegacyPerformanceConfig
GetDefaultPerformanceConfig(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;
LegacyPerformanceConfig Search(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const AnyInvokeParams& invoke_ctx) const override;

private:
template <typename Tgpu>
Expand All @@ -1753,35 +1771,33 @@ struct ConvOclDirectFwdLegacyExhaustiveSearch : ConvTunableSolver<LegacyPerforma
const AnyInvokeParams& invoke_ctx) const;
};

struct ConvOclDirectFwd : ConvOclDirectFwdLegacyExhaustiveSearch
struct MIOPEN_INTERNALS_EXPORT ConvOclDirectFwd final : ConvOclDirectFwdLegacyExhaustiveSearch
{
const std::string& SolverDbId() const override { return GetSolverDbId<ConvOclDirectFwd>(); }

MIOPEN_INTERNALS_EXPORT static ConvSolution
BaseGetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const LegacyPerformanceConfig&);
static ConvSolution BaseGetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const LegacyPerformanceConfig&);

MIOPEN_INTERNALS_EXPORT bool
IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const LegacyPerformanceConfig&) const override;
MIOPEN_INTERNALS_EXPORT bool
IsValidPerformanceConfig(const ExecutionContext&,
bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;
ConvSolution GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const LegacyPerformanceConfig&) const override;
bool IsValidPerformanceConfig(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const LegacyPerformanceConfig&) const override;
};

struct ConvOclDirectFwd1x1 final : ConvOclDirectFwdLegacyExhaustiveSearch
struct MIOPEN_INTERNALS_EXPORT ConvOclDirectFwd1x1 final : ConvOclDirectFwdLegacyExhaustiveSearch
{
const std::string& SolverDbId() const override { return GetSolverDbId<ConvOclDirectFwd1x1>(); }

MIOPEN_INTERNALS_EXPORT bool
IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const LegacyPerformanceConfig&) const override;
bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;
ConvSolution GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const LegacyPerformanceConfig&) const override;

bool IsValidPerformanceConfig(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
Expand Down Expand Up @@ -2194,14 +2210,17 @@ struct PerformanceConfigAsmDirect3x3WrW : PerfConfigBase<PerformanceConfigAsmDir
f(self.n_per_group, "n_per_group");
}

// clang-format off
int GetLimitWaveCnt() const { return limit_wave_cnt; }
int GetReverseInout() const { return reverse_inout; }
int GetChunkSize() const { return chunk_size; }
int GetKPerWave() const { return k_per_wave; }
int GetPipeLinesDepth() const { return pipe_lines_depth; }
int GetNPerGroup() const { return n_per_group; }
int GetCPerWave() const { assert(chunk_size); return 64 / chunk_size; } // clang-format on
int GetCPerWave() const
{
assert(chunk_size);
return 64 / chunk_size;
}

MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&,
const miopen::conv::ProblemDescription&);
Expand Down Expand Up @@ -2327,20 +2346,24 @@ struct PerformanceConfigConvAsmBwdWrW1x1 : PerfConfigBase<PerformanceConfigConvA
f(self.data_prefetch, "data_prefetch");
}

// clang-format off
int GetChunkSize() const { return chunk_size; }
int GetCPerGpr() const { return c_per_gpr; }
int GetCMult() const { return c_mult; }
int GetKPerGpr() const { return k_per_gpr; }
int GetKMult() const { return k_mult; }
int GetNPerGpr() const { return n_per_gpr; }
int GetNPartCnt() const { return n_part_cnt; }
int GetHWPerGpr() const { assert(c_per_gpr); assert(n_per_gpr); assert(chunk_size);
return wave_size / (c_per_gpr * n_per_gpr * chunk_size); } // "hw" stands for "height-and-width".
// "hw" stands for "height-and-width".
int GetHWPerGpr() const
{
assert(c_per_gpr);
assert(n_per_gpr);
assert(chunk_size);
return wave_size / (c_per_gpr * n_per_gpr * chunk_size);
}
int GetReadSize() const { return read_size; }
int GetShortStore() const {return short_store; }
int GetShortStore() const { return short_store; }
int GetDataPrefetch() const { return data_prefetch; }
// clang-format on

MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&,
const miopen::conv::ProblemDescription&);
Expand Down Expand Up @@ -2423,26 +2446,23 @@ struct PerformanceConfigConvOclBwdWrw2
f(self.n_out_rows_in_lcl, "n_out_rows_in_lcl");
}

// clang-format off
int GetNumWaves() const { return n_waves; }
int GetReadSize() const { return read_size; }
int GetNumOutChannelsPerTile() const { return n_out_channels_per_tile; }
int GetNumOutChannelTiles() const { return n_out_channels_tiles; }
int GetNumOutRowsPerIterPerWork() const { return n_out_rows_in_lcl; } // clang-format on
int GetNumOutRowsPerIterPerWork() const { return n_out_rows_in_lcl; }

MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&);
MIOPEN_INTERNALS_EXPORT bool IsValidValue() const;
MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&);
MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const;
MIOPEN_INTERNALS_EXPORT bool
operator==(const PerformanceConfigConvOclBwdWrw2<N_BATCH_LOOPS>& other) const;
void HeuristicInit(const miopen::conv::ProblemDescription&);
bool IsValidValue() const;
bool SetNextValue(const miopen::conv::ProblemDescription&);
bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription&) const;
bool operator==(const PerformanceConfigConvOclBwdWrw2<N_BATCH_LOOPS>& other) const;
};

template <int N_BATCH_LOOPS>
struct ConvOclBwdWrW2 : ConvTunableSolver<PerformanceConfigConvOclBwdWrw2<N_BATCH_LOOPS>>
struct ConvOclBwdWrW2 final : ConvTunableSolver<PerformanceConfigConvOclBwdWrw2<N_BATCH_LOOPS>>
{
const std::string& SolverDbId() const override
MIOPEN_INTERNALS_EXPORT const std::string& SolverDbId() const override
{
return this->template GetSolverDbId<ConvOclBwdWrW2<N_BATCH_LOOPS>>();
}
Expand All @@ -2462,14 +2482,16 @@ struct ConvOclBwdWrW2 : ConvTunableSolver<PerformanceConfigConvOclBwdWrw2<N_BATC
IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize(
const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
bool MayNeedWorkspace() const override { return true; }
MIOPEN_INTERNALS_EXPORT bool MayNeedWorkspace() const override { return true; }
MIOPEN_INTERNALS_EXPORT ConvSolution
GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&,
const PerformanceConfigConvOclBwdWrw2<N_BATCH_LOOPS>&) const override;

protected:
bool IsApplicableBase(const ExecutionContext&, const miopen::conv::ProblemDescription&) const;

friend struct ConvOclBwdWrW2NonTunable;
};

// To suppress misleading clang warnings
Expand Down Expand Up @@ -2498,29 +2520,20 @@ extern template struct ConvOclBwdWrW2<16>;
/// Basically, this is *hack* for non-group 3x3 and 1x1 cases.
/// It is assumed that Solutions provided by the ConvOclBwdWrW2 solver
/// would never beat 3x3 and 1x1 assembly WrW kernels, even after tuning.
struct ConvOclBwdWrW2NonTunable final : ConvOclBwdWrW2<1>
struct MIOPEN_INTERNALS_EXPORT ConvOclBwdWrW2NonTunable final : ConvSolver
{
const std::string& SolverDbId() const override
{
return GetSolverDbId<ConvOclBwdWrW2NonTunable>();
}

MIOPEN_INTERNALS_EXPORT bool
IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override;
MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const;
InvokerFactory GetInvokerFactory(const ExecutionContext& ctx,
const miopen::conv::ProblemDescription& problem) const
{
return *GetSolution(ctx, problem).invoker_factory;
}

private:
// This function dervied from ConvOclBwdWrW2 is declared private
// so that this solver is not marked searchable/tunable.
using ConvOclBwdWrW2<1>::GetDefaultPerformanceConfig;
using ConvOclBwdWrW2<1>::GetSolution;
using ConvOclBwdWrW2<1>::GetInvokerFactory;
bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;
size_t GetWorkspaceSize(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;
bool MayNeedWorkspace() const override { return true; }
ConvSolution GetSolution(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;
};

struct ConvOclBwdWrW53 final : ConvSolver
Expand Down
19 changes: 1 addition & 18 deletions src/include/miopen/generic_search.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
#include <miopen/invoke_params.hpp>
#include <miopen/logger.hpp>
#include <miopen/timer.hpp>
#include <miopen/type_traits.hpp>
#include <miopen/mt_queue.hpp>
#include <miopen/generic_search_controls.hpp>

Expand All @@ -55,6 +54,7 @@ namespace solver {
namespace debug {
// This struct is not MT-safe, meaning one should use it before starting threads, thus avoiding
// constructing it inside a worker thread.
/// \todo This class should be moved out of the library
struct MIOPEN_INTERNALS_EXPORT TuningIterationScopedLimiter
{
TuningIterationScopedLimiter(std::size_t new_limit);
Expand Down Expand Up @@ -241,7 +241,6 @@ class HeartBeat
/// - Its return type shall be suitable for instantiation of the ComputedContainer.
/// * GetSolution shall be implemented.
/// * Solution should provide invoker
/// * RunAndMeasureSolution must NOT be implemented. Invoker will be used instead.
///
/// clang-format-off
/// -----------------------------------------------
Expand All @@ -263,17 +262,6 @@ class HeartBeat
/// ------------------------------------------------
/// clang-format-on

template <class Solver, class Top, class Bottom>
using RunAndMeasure_t =
decltype(std::declval<Solver>().RunAndMeasureSolution(std::declval<miopen::Handle&>(),
std::declval<Bottom>(),
std::declval<Top>(),
std::declval<ConstData_t>(),
std::declval<ConstData_t>(),
std::declval<ExecutionContext>(),
std::declval<ConvSolution>(),
std::declval<float&>()));

template <class Solver, class Context, class Problem>
auto GetAllConfigs(const Solver s, const Context& context, const Problem& problem)
-> ComputedContainer<decltype(s.GetDefaultPerformanceConfig(context, problem)),
Expand Down Expand Up @@ -367,11 +355,6 @@ auto GenericSearch(const Solver s,
const AnyInvokeParams& invoke_ctx_)
-> decltype(s.GetDefaultPerformanceConfig(context_, problem))
{
static_assert(
!(HasMember<RunAndMeasure_t, Solver, ConstData_t, Data_t>{} ||
HasMember<RunAndMeasure_t, Solver, Data_t, ConstData_t>{}),
"RunAndMeasure is obsolete. Solvers should implement auto-tune evaluation in invoker");

auto context = context_;
context.is_for_generic_search = true;

Expand Down
Loading

0 comments on commit 38258d5

Please sign in to comment.